diff --git a/TODO b/TODO index f43d284..c4daa01 100644 --- a/TODO +++ b/TODO @@ -44,6 +44,7 @@ Z85 headers: is marked as to be ignored which could be saved as a state when using blockmode. +Check is_utf8 license. Python binding, e.g.: py % cdll.LoadLibrary("libsodium.so.8") diff --git a/include/pcp/z85.h b/include/pcp/z85.h index a82e3b6..2257788 100644 --- a/include/pcp/z85.h +++ b/include/pcp/z85.h @@ -57,7 +57,7 @@ we pad the input with zeroes and remove them after decoding. \return Returns a pointer to the padded data. */ -unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen); +byte *pcp_padfour(byte *src, size_t srclen, size_t *dstlen); /** Unpad padded input data. @@ -70,7 +70,7 @@ unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen); \return Returns the unpadded size of the data. */ -size_t pcp_unpadfour(unsigned char *src, size_t srclen); +size_t pcp_unpadfour(byte *src, size_t srclen); /** Decode data from Z85 encoding. @@ -83,7 +83,7 @@ size_t pcp_unpadfour(unsigned char *src, size_t srclen); returns NULL. Check fatals_if_any(). */ -unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen); +byte *pcp_z85_decode(char *z85block, size_t *dstlen); /** Encode data to Z85 encoding. @@ -96,7 +96,7 @@ unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen); \return Returns a string (char array) containing the Z85 encoded data. */ -char *pcp_z85_encode(unsigned char *raw, size_t srclen, size_t *dstlen); +char *pcp_z85_encode(byte *raw, size_t srclen, size_t *dstlen); /** Read a Z85 encoded file. @@ -120,7 +120,21 @@ char *pcp_readz85file(FILE *infile); \return Raw Z85 encoded string with comments, headers and newlines removed. */ -char *pcp_readz85string(unsigned char *input, size_t bufsize); +char *pcp_readz85string(byte *input, size_t bufsize); + +/** Check if a binary array is utf8. 
+ + Based on http://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c + by Christoph Gärtner + + Modified to only check for one utf8 char. The given sequence must be + at least 4 bytes long. No boundary checks are performed. + + \param[in] bytes A byte sequence at least 4 bytes long. + + \return Returns 0 if the sequence does not start with a multi-byte utf8 char, otherwise the size of that char in bytes (2, 3 or 4). +*/ +uint8_t is_utf8(const byte * string); /** Determine if a buffer is binary or ascii. @@ -129,7 +143,7 @@ char *pcp_readz85string(unsigned char *input, size_t bufsize); \return Returns 0 if the input is ascii or a number > 0 if it contains binary data. */ -size_t _buffer_is_binary(unsigned char *buf, size_t len); +size_t _buffer_is_binary(byte *buf, size_t len); /** Determine if a char is a Z85 character diff --git a/libpcp/z85.c b/libpcp/z85.c index 5ab75dd..f795511 100644 --- a/libpcp/z85.c +++ b/libpcp/z85.c @@ -22,19 +22,102 @@ #include "z85.h" -size_t _buffer_is_binary(unsigned char *buf, size_t len) { +uint8_t is_utf8(const byte *bytes) { + if( (// non-overlong 2-byte + (0xC2 <= bytes[0] && bytes[0] <= 0xDF) && + (0x80 <= bytes[1] && bytes[1] <= 0xBF) + ) + ) { + return 2; + } + + if( (// excluding overlongs + bytes[0] == 0xE0 && + (0xA0 <= bytes[1] && bytes[1] <= 0xBF) && + (0x80 <= bytes[2] && bytes[2] <= 0xBF) + ) || + (// straight 3-byte + ((0xE1 <= bytes[0] && bytes[0] <= 0xEC) || + bytes[0] == 0xEE || + bytes[0] == 0xEF) && + (0x80 <= bytes[1] && bytes[1] <= 0xBF) && + (0x80 <= bytes[2] && bytes[2] <= 0xBF) + ) || + (// excluding surrogates + bytes[0] == 0xED && + (0x80 <= bytes[1] && bytes[1] <= 0x9F) && + (0x80 <= bytes[2] && bytes[2] <= 0xBF) + ) + ) { + return 3; + } + + if( (// planes 1-3 + bytes[0] == 0xF0 && + (0x90 <= bytes[1] && bytes[1] <= 0xBF) && + (0x80 <= bytes[2] && bytes[2] <= 0xBF) && + (0x80 <= bytes[3] && bytes[3] <= 0xBF) + ) || + (// planes 4-15 + (0xF1 <= bytes[0] && bytes[0] <= 0xF3) && + (0x80 <= bytes[1]
&& bytes[1] <= 0xBF) && + (0x80 <= bytes[2] && bytes[2] <= 0xBF) && + (0x80 <= bytes[3] && bytes[3] <= 0xBF) + ) || + (// plane 16 + bytes[0] == 0xF4 && + (0x80 <= bytes[1] && bytes[1] <= 0x8F) && + (0x80 <= bytes[2] && bytes[2] <= 0xBF) && + (0x80 <= bytes[3] && bytes[3] <= 0xBF) + ) + ) { + return 4; + } + + return 0; +} + +size_t _buffer_is_binary(byte *buf, size_t len) { size_t pos; + byte wide[4] = {0}; + uint8_t utf = 0; + int i; + /* start at 1, to circumvent returning 0 if we find a match at position 0, which would lead the caller to believe the buffer is not binary */ for (pos=1; pos 1) { + pos += i; /* jump over the utf we already found */ + utf = 1; + } + } + else + break; + } + + if(utf == 1) { + /* it's a utf8 char, continue checking, reset wide */ + memset(wide, 0, 4); + continue; + } + + break; /* if we reach this, then it's binary and not utf8, stop checking */ } } if(pos < len) - return pos; + return pos; /* binary */ else - return 0; + return 0; /* text */ } uint8_t _parse_zchar(Buffer *z, uint8_t c, uint8_t is_comment) { @@ -52,15 +135,15 @@ uint8_t _parse_zchar(Buffer *z, uint8_t c, uint8_t is_comment) { return is_comment; } -unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen) { +byte *pcp_padfour(byte *src, size_t srclen, size_t *dstlen) { size_t outlen, zerolen; - unsigned char *dst; + byte *dst; outlen = srclen; while (outlen % 4 != 0) outlen++; zerolen = outlen - srclen; - dst = (unsigned char*)ucmalloc(outlen); + dst = (byte*)ucmalloc(outlen); memcpy(dst, src, srclen); /* add the original */ memset(&dst[srclen], 0, zerolen); /* pad with zeroes */ @@ -69,7 +152,7 @@ unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen) { return dst; } -size_t pcp_unpadfour(unsigned char *src, size_t srclen) { +size_t pcp_unpadfour(byte *src, size_t srclen) { size_t outlen; size_t i; @@ -85,8 +168,8 @@ size_t pcp_unpadfour(unsigned char *src, size_t srclen) { return outlen; } -unsigned char 
*pcp_z85_decode(char *z85block, size_t *dstlen) { - unsigned char *bin = NULL; +byte *pcp_z85_decode(char *z85block, size_t *dstlen) { + byte *bin = NULL; size_t binlen, outlen; binlen = strlen(z85block) * 4 / 5; @@ -104,12 +187,12 @@ unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen) { return bin; } -char *pcp_z85_encode(unsigned char *raw, size_t srclen, size_t *dstlen) { +char *pcp_z85_encode(byte *raw, size_t srclen, size_t *dstlen) { int pos = 0; size_t outlen, blocklen, zlen; /* make z85 happy (size % 4) */ - unsigned char *padded = pcp_padfour(raw, srclen, &outlen); + byte *padded = pcp_padfour(raw, srclen, &outlen); /* encode to z85 */ zlen = (outlen * 5 / 4) + 1; @@ -146,10 +229,10 @@ char *pcp_z85_encode(unsigned char *raw, size_t srclen, size_t *dstlen) { char *pcp_readz85file(FILE *infile) { - unsigned char *input = NULL; - unsigned char *tmp = NULL; + byte *input = NULL; + byte *tmp = NULL; size_t bufsize = 0; - unsigned char byte[1]; + byte byte[1]; while(!feof(infile)) { if(!fread(&byte, 1, 1, infile)) @@ -171,7 +254,7 @@ char *pcp_readz85file(FILE *infile) { return pcp_readz85string(input, bufsize); } -char *pcp_readz85string(unsigned char *input, size_t bufsize) { +char *pcp_readz85string(byte *input, size_t bufsize) { int i; size_t MAXLINE = 1024;