wallet/src/serialization.h

namespace ro {

	//	Decay to pointer is dangerously convenient,
	//	but in some situations it is just convenient.
	//	This class provides a std::array one larger
	//	than the compile time string size (room for the
	//	terminating null), which converts implicitly to
	//	char*, std::string, wxString and std::span<byte>.
	//	In some code, this is ambiguous, so casts
	//	must sometimes be explicitly invoked.
	//
	//	The buffer is zero filled on construction, so it holds a
	//	null terminated string until something else overwrites it.
	template <unsigned int stringlen>
	class  CompileSizedString : public std::array<char, stringlen + 1>{
	public:
		//	Number of characters, not counting the terminating null.
		static constexpr int length{ stringlen };

		//	Empty string: every element, terminator included, is '\0'.
		CompileSizedString() {
			this->fill('\0');
		}

		//	Copies at most stringlen characters of the null terminated
		//	string psz to the front of the buffer; longer input is truncated.
		//	A null psz yields the empty string.  psz is only read.
		CompileSizedString(char *psz) {
			this->fill('\0');
			if (psz != nullptr) {
				auto tsz{ this->begin() };
				const auto usz = tsz + stringlen;
				while (tsz < usz && *psz != '\0')
					*tsz++ = *psz++;
				//	The loop never reaches the final element, which fill()
				//	already zeroed, so the result is always terminated.
			}
		}

		//	Pointer to the first character (mutable, lvalues only).
		operator char* () & {
			return this->data();
		}

		//	Pointer to the first character (read only).
		operator const char* () const& {
			return this->data();
		}
		operator const char* () const&& {
			return this->data();
		}

		//	Copy of all `length` characters, whatever they contain
		//	(the copy does not stop at an embedded null).
		operator std::string() const& {
			return std::string(this->data(), this->length);
		}
		operator std::string() const&& {
			return std::string(this->data(), this->length);
		}

		//	The buffer interpreted as a null terminated UTF-8 string.
		operator wxString() const& {
			return wxString::FromUTF8Unchecked(this->data());
		}

		//	Byte view over the whole buffer, terminator included
		//	(stringlen + 1 bytes).  Note that the view is mutable even
		//	though the conversion is available on const objects.
		operator std::span<byte>() const& {
			return std::span<byte>(reinterpret_cast<byte*>(const_cast<char*>(this->data())), stringlen + 1);
		}
		operator wxString() const&& {
			return wxString::FromUTF8Unchecked(this->data());
		}
		operator std::span<byte>() const&& {
			return std::span<byte>(reinterpret_cast<byte*>(const_cast<char*>(this->data())), stringlen + 1);
		}
	};

	//	This concept identifies an indexable byte type over which a span can
	//	be generated, such as an std::array, but not pointers.
	//	A type T qualifies when std::size(a) and a[0] are both valid for an
	//	object a of type T, and the result of indexing is exactly one byte wide.
	//	NOTE(review): the requires-parameter is declared by value as `T a`; for a
	//	C array type that presumably decays to a pointer, for which std::size is
	//	not valid - confirm whether C arrays are really accepted.
	template<class T>
	concept byte_spannable = requires (T a) {
		std::size(a);
		a[0];
	} && sizeof(std::declval<T>()[0]) == 1;

	//	Generates a read only span of bytes over any byte_spannable object.
	//	A std::string is spanned including its terminating null
	//	(length() + 1 bytes); anything else is spanned over exactly
	//	std::size(a) elements, and an empty object gives an empty span.
	//	The span refers to the storage of a, so it must not outlive a.
	template<byte_spannable T>
	auto serialize(const T& a) {
		//	size_t, not int: sizes are unsigned and an int would narrow them
		size_t l;
		const void* pt;
		if constexpr (std::is_same_v<std::remove_cvref_t<T>, std::string>) {
			l = a.length() + 1;
			pt = a.c_str();
		}
		else {
			l = std::size(a);
			//	a[0] does not exist in an empty object, so do not touch it
			pt = (l != 0) ? static_cast<const void*>(&a[0]) : nullptr;
		}
		return  std::span(static_cast<const byte *>(pt), l);
	}

	//	Compile time test to see if a type has a blob array member.
	//	This can be used in if constexpr (is_blob_field_type<T>::value)
	//	By convention, blob fields are an std::array of unsigned bytes
	//	therefore already serializable.
	//
	//	value is true when T has a member named blob that offers size() and
	//	operator[], and whose elements are one byte wide; false otherwise.
	template <class T> struct is_blob_field_type{
	private:
		//	Selected when U has a suitable blob member.  The int parameter makes
		//	this overload the better match for the literal 0, so the choice
		//	between the two probes is never ambiguous.
		template <typename U> static constexpr decltype(void(std::declval<U>().blob.size()), void(std::declval<U>().blob[0]), bool()) probe(int) {
			return sizeof(std::declval<U>().blob[0]) == 1;
		}
		//	Fallback for every type the first probe rejects.
		template <typename U> static constexpr bool probe(long) {
			return false;
		}
	public:
		//	True when U has a blob member whose elements are one byte wide.
		template <typename U> static constexpr bool test() {
			return probe<U>(0);
		}
		static constexpr bool value = is_blob_field_type::template test<T>();
	};

	//	Concept form of is_blob_field_type: satisfied by any type T for
	//	which ro::is_blob_field_type<T>::value is true.
	template<class T> concept blob_type = ro::is_blob_field_type<T>::value;


	//	At present our serial classes consist of std::span<uint8_t> and custom classes that publicly inherit from std::span<byte>
	//	To handle compound objects, add custom classes inheriting from std::span<byte>[n]

	//	Serializes any object that carries a blob record by handing the
	//	blob member to the byte oriented serialize overload.  The resulting
	//	span of bytes over the blob is normally sufficient as a machine
	//	independent representation of the whole object.
	template <blob_type T>	std::span<const byte> serialize(const T& pt) {
		const auto& record = pt.blob;
		return serialize(record);
	}

	//	method that assumes that any char * pointer points at a null terminated
	//	string and generates a std::span of bytes over the string including the
	//	terminating null.
	// 	we assume the string is already machine independent, which is to say, we assume
	//	it comes from a utf8 locale.
	//	sp must not be null: strlen is applied to it unconditionally.

	inline auto serialize(const char* sp) {
		//	reinterpret_cast is the well formed way to view chars as bytes;
		//	a pointer cannot be converted to std::nullptr_t.
		return std::span(reinterpret_cast<const byte*>(sp), strlen(sp) + 1);
	}

	//	Serializes the object returned by wxString::ToUTF8(): it is converted
	//	to a const char* and passed on to the serialize overload for
	//	character pointers.
	inline auto serialize(const decltype(std::declval<wxString>().ToUTF8()) sz){
		const char* const utf8 = static_cast<const char*>(sz);
		return serialize(utf8);
	}
	/* Don't do this.  Disaster ensues:
	inline auto serialize(const wxString& wxstr) {
		return serialize(static_cast<const char*>(wxstr.ToUTF8()));	}
	Instead do this:*/
	//	Deleted overloads for wxString taken by const reference, by value and
	//	by mutable reference: a call that resolves to any of them fails to compile.
	std::span<const byte>serialize(const wxString&) = delete;
	std::span<const byte>serialize(const wxString) = delete;
	std::span<const byte>serialize(wxString&) = delete;

	/*If we allowed wxWidgets strings to be serializable, all sorts of surprising things
	would be serializable in surprising ways, because wxWidgets can convert all
	sorts of things into strings that you were likely not expecting, in ways
	unlikely to be machine independent, so if you give an object to be
	hashed that you have not provided some correct means for serializing, C++ is
	apt to unhelpfully and unexpectedly turn it into a wxString.

	If you make wxStrings hashable, surprising things become hashable.
	However, we do make the strange data structure provided by wxString.ToUTF8() hashable,
	so that the wxString will not be implicitly hashable, but will be explicitly hashable.
	*/

	//	Data structure containing a serialized unsigned integer.
	//	Converts an unsigned integer to VLQ format, stores the bytes at the tail
	//	of bblob, and makes the std::span<byte> base cover exactly those bytes.
	//	VLQ format, Variable Length Quantity  (It is a standard used by LLVM and others)
	//	On reflection, VLQ format is not convenient for the intended usage (merkle patricia trees
	//	representing SQL indexes), and a better format is to compress leading zero or leading 0xFF bytes
	//	with the length of the run being implied by a count of the bytes following the run.
	//
	//	Because the span points into this object's own bblob, copies re-point
	//	their span at their own bblob instead of at the source's.
	template<std::unsigned_integral T> class userial : public std::span<byte> {
	public:
		//	Backing store, sized for the worst case of one byte per seven bits of T.
		std::array<byte, (std::numeric_limits<T>::digits + 6) / 7> bblob;

		//	Encodes i, least significant group last; every byte except the
		//	last carries the continuation flag 0x80.
		userial(T i) {
			byte* const end = bblob.data() + bblob.size();
			byte* p = end;
			*(--p) = i & 0x7f;
			i >>= 7;
			while (i != 0) {
				*(--p) = (i & 0x7f) | 0x80;
				i >>= 7;
			}
			assert(p >= bblob.data());
			static_cast<std::span<byte>&>(*this) = std::span<byte>(p, end);
		}

		//	Copying the span base as it stands would leave it pointing into
		//	other.bblob, so copy the bytes and rebuild the span over our own.
		userial(const userial& other) : std::span<byte>(), bblob(other.bblob) {
			rebind(other.size());
		}
		userial& operator=(const userial& other) {
			bblob = other.bblob;
			rebind(other.size());
			return *this;
		}
	private:
		//	Points the span base at the last `used` bytes of our own bblob.
		void rebind(size_t used) {
			byte* const end = bblob.data() + bblob.size();
			static_cast<std::span<byte>&>(*this) = std::span<byte>(end - used, end);
		}
	};

	//	Data structure containing a serialized signed integer.
	//	Converts a signed integer to VLQ format, stores the bytes at the tail
	//	of bblob, and makes the std::span<byte> base cover exactly those bytes.
	//	VLQ format, Variable Length Quantity  (It is a standard used by LLVM and others)
	//
	//	Because the span points into this object's own bblob, copies re-point
	//	their span at their own bblob instead of at the source's.
	template<std::signed_integral T> class iserial : public std::span<byte> {
	public:
		//	Backing store, sized for the worst case: the value bits of T plus a sign bit.
		std::array<byte, (std::numeric_limits<T>::digits + 7) / 7> bblob;

		//	Encodes i, least significant group last; every byte except the
		//	last carries the continuation flag 0x80, and bit six of the first
		//	byte ends up equal to the sign of i.
		iserial(T i) {
			//	Throw away the repeated leading sign bits and work out how many
			//	continuation bytes the remaining bits need.  The count is taken
			//	in the unsigned counterpart of T, so it does not depend on the
			//	width of size_t.
			using unsigned_t = std::make_unsigned_t<T>;
			const unsigned_t ui = static_cast<unsigned_t>(i);
			const int redundant = (i < 0) ? std::countl_one(ui) : std::countl_zero(ui);
			unsigned count = static_cast<unsigned>((std::numeric_limits<unsigned_t>::digits - redundant) / 7);

			byte* const end = bblob.data() + bblob.size();
			byte* p = end;
			*(--p) = i & 0x7f;
			while (count-- != 0) {
				i >>= 7;
				*(--p) = (i & 0x7f) | 0x80;
			}
			assert(p >= bblob.data());
			static_cast<std::span<byte>&>(*this) = std::span<byte>(p, end);
		}

		//	Copying the span base as it stands would leave it pointing into
		//	other.bblob, so copy the bytes and rebuild the span over our own.
		iserial(const iserial& other) : std::span<byte>(), bblob(other.bblob) {
			rebind(other.size());
		}
		iserial& operator=(const iserial& other) {
			bblob = other.bblob;
			rebind(other.size());
			return *this;
		}
	private:
		//	Points the span base at the last `used` bytes of our own bblob.
		void rebind(size_t used) {
			byte* const end = bblob.data() + bblob.size();
			static_cast<std::span<byte>&>(*this) = std::span<byte>(end - used, end);
		}
	};

	//	Converts the machine dependent representation of an integer into a
	//	span pointing at a compact machine independent representation of it.
	//	The integer is split into seven bit groups in big endian order
	//	(VLQ format), the high order bit of each byte indicating that more
	//	bytes are to come.
	//	For an unsigned integer, all high order bytes of the form 0x80 are left out.
	//	For a positive signed integer, the same, except that the first byte
	//	of what is left must have zero at bit six.
	//	For a negative signed integer, all the 0xFF bytes are left out, except
	//	that the first byte of what is left must have a one at bit six.
	//
	//	Small numbers get compressed.
	//	Primarily used by hash and hsh so that the same numbers on different
	//	machines will generate the same hash.
	//
	//	The deserialize functions need not all share one name - indeed they
	//	have to be distinct, because serialized data contains no type
	//	information - but for the sake of template code everything that may
	//	be serialized is serialized by the serialize command, so that one
	//	template can deal with any number of serializable types.
	template<std::unsigned_integral T> userial<T> serialize(T i) {
		using encoded = userial<T>;
		return encoded(i);
	}
	//	Signed counterpart: wraps the value in an iserial.
	//	The deserialize functions need not all share one name, but for the
	//	sake of template code everything that may be serialized is serialized
	//	by the serialize command, so that one template can deal with any
	//	number of serializable types.
	template<std::signed_integral T> iserial<T> serialize(T i) {
		using encoded = iserial<T>;
		return encoded(i);
	}

//	Turns a compact machine independent representation of a signed integer
//	into a signed integer of type T.
//	The representation is big endian groups of seven bits, the high bit of a
//	byte being set when more bytes follow, and bit six of the first byte
//	giving the sign.  p must point at the first byte.
//	Throws BadDataException as soon as a tenth continuation byte is met,
//	without reading any further.
	template<std::signed_integral T> T deserialize(const byte* p) {
		constexpr auto max_continuation_bytes = (std::numeric_limits<int64_t>::digits + 6) / 7;
		const byte* const first = p;
		//	Start from -64 when the sign bit is set, so that the sign is
		//	extended through every bit above the first group.
		T i = (*p & 0x40) ? -64 : 0;
		while (*p & 0x80) {
			//	Check before consuming the byte: input that never terminates
			//	must not drag the scan beyond the permitted length.
			if (p - first >= max_continuation_bytes)throw BadDataException();
			i = (i | (*p++ & 0x7F)) << 7;
		}
		return i | *p;
	}
	//	Turns a compact machine independent representation of an unsigned
	//	integer into an unsigned integer of type T.
	//	The representation is big endian groups of seven bits, the high bit of
	//	a byte being set when more bytes follow.  p must point at the first byte.
	//	Throws BadDataException as soon as a tenth continuation byte is met,
	//	without reading any further.
	template<std::unsigned_integral T> T deserialize(const byte * p) {
		constexpr int max_continuation_bytes = 9;
		const byte* const first = p;
		T i{ 0 };
		while (*p & 0x80) {
			//	Check before consuming the byte: input that never terminates
			//	must not drag the scan beyond the permitted length.
			if (p - first >= max_continuation_bytes)throw BadDataException();
			i = (i | (*p++ & 0x7F)) << 7;
		}
		return i | *p;
	}

	//	Turns a compact machine independent representation of an integer,
	//	held at the start of the span g, into an integer of type T.
	//	Throws BadDataException when g is empty or when the representation is
	//	not terminated inside g, so nothing outside the span is ever read.
	//	Bytes of g that follow the representation are ignored.
	template<std::integral T> T deserialize(std::span<const byte> g) {
		const byte* const first = g.data();
		const byte* const last = first + g.size();
		//	Find the terminating byte (high bit clear) before parsing.
		const byte* p = first;
		while (p != last && (*p & 0x80)) ++p;
		if (p == last)throw BadDataException();
		return deserialize<T>(first);
	}

	/*
		It will be about a thousand years before numbers larger than 64 bits
		appear in valid well formed input, and bad data structures have to be
		dealt with a much higher level that knows what the numbers mean,
		and deals with them according to their meaning

		Until then the low level code will arbitrarily truncate numbers larger
		than sixty four bits, but numbers larger than sixty four bits are
		permissible in input, are valid at the lowest level.

		We return uint64_t, rather than uint_fast64_t to ensure that all
		implementations misinterpret garbage and malicious input in the
		same way.
		We cannot protect against Machiavelli perverting the input, so we
		don't try very hard to prevent Murphy perverting the input,
		but we do try to prevent Machiavelli from perverting the input in
		ways that will induce peers to disagree.

		We use an explicit narrow_cast, rather than simply declaring the
		function to be uint64_t, in order to express the intent to uniformly
		force possibly garbage data being deserialized to standardized
		garbage.

		We protect against malicious and ill formed data that would cause the
		system to go off the rails at a point of the enemy's choosing,
		and we protect against malicious and ill formed data that one party
		might interpret in one way, and another party might interpret in a
		different way.

		Ill formed data that just gets converted into well formed, but
		nonsense data can cause no harm that well formed nonsense data
		could not cause.

		It suffices, therefore, to ensure that all implementations misinterpret
		input containing unreasonably large numbers as the same number.

		Very large numbers are valid in themselves, but not going to be valid
		as part of valid data structures for a thousand years or so.

		The largest numbers occurring in well formed valid data will be
		currency amounts, and the total number of the smallest unit of
		currency is fixed at 2^64-1 which will suffice for a thousand years.
		Or we might allow arbitrary precision floating point with powers of
		a thousand, so that sensible numbers to a human	are represented by
		sensible numbers in the actual representation.

		secret keys, scalars are actually much larger numbers, modulo
		0x1000000000000000000000000000000014def9dea2f79cd65812631a5cf5d3ecU
		but they are represented in a different format, their binary format
		being fixed size low endian format, as 256 bit numbers, though only
		253 bits are actually needed and used, and their human readable
		format being 44 digits in a base 58 representation.*/

	//	a compile time test to check if an object class has a machine independent representation
	template <typename T, typename... Args> static constexpr bool serializable() {
		if constexpr (requires(T a) {
			serialize(a);
		}) {
			if constexpr (sizeof...(Args) > 0)  return serializable<Args...>();
			else return true;
		}
		else return false;
	};

	//	Concept form of serializable: satisfied when serializable<Args...>()
	//	evaluates to true for the listed types.
	template<typename... Args>
	concept has_machine_independent_representation = serializable<Args...>();

	//	Identity function whose parameter type must satisfy
	//	has_machine_independent_representation; returns its argument unchanged.
	//	NOTE(review): judging by the name, presumably meant to provoke a compile
	//	error for types that cannot be serialized - confirm against callers.
	template<has_machine_independent_representation T>
	T trigger_error(T x) { return x; };

	//	Compile time check of the concept machinery: double must not count as
	//	having a machine independent representation, while a span of const
	//	bytes, a char pointer and a span of const chars together must.
	static_assert( !has_machine_independent_representation<double>
		&& has_machine_independent_representation<std::span<const byte>, char*, std::span<const char>>,
		"concepts needed");

	//	Renders the blob member of pt as hexadecimal text using
	//	sodium_bin2hex.  The returned string has room for two characters
	//	per byte of T.
	template<class T> ro::CompileSizedString< (2 * sizeof(T))>bin2hex(const T& pt) {
		using hex_string = ro::CompileSizedString< (2 * sizeof(T))>;
		hex_string sz;
		const auto& raw = pt.blob;
		//	two characters per blob byte, plus the terminating null
		const auto hex_capacity = sizeof(raw) * 2 + 1;
		sodium_bin2hex(&sz[0], hex_capacity, &raw[0], raw.size());
		return sz;
	}

	//	Decodes the 2 * sizeof(T) hexadecimal characters held in sz into the
	//	bytes of a T, using sodium_hex2bin, and returns that T.
	//	The result is value initialized first (all zero for a plain aggregate),
	//	so bytes that sodium_hex2bin does not write are well defined instead of
	//	being uninitialized memory.
	//	NOTE(review): the return code of sodium_hex2bin is not examined, so
	//	malformed input is not reported to the caller - consider whether it
	//	should raise an error.
	template<class T> T hex2bin(const ro::CompileSizedString< (2 * sizeof(T))>& sz){
		T pt{};
		size_t bin_len{ sizeof(T) };
		sodium_hex2bin(
			reinterpret_cast <unsigned char* const>(&pt),
			sizeof(T),
			&sz[0], 2 * sizeof(T),
			nullptr, &bin_len, nullptr
		);
		return pt;
	}

	template <class T>decltype(std::declval<T>().blob, ro::CompileSizedString < (sizeof(T) * 8 + 5) / 6>()) to_base64_string(const T& p_blob) {
		ro::CompileSizedString < (sizeof(T) * 8 + 5) / 6> sz;
		bits2base64(
			&(p_blob.blob[0]), 0, sizeof(p_blob.blob) * 8,
			std::span<char>(sz)
		);
		return sz;
	}

} //End ro namespace