Added is_utf8() check to _buffer_is_binary() so that UTF-8 comments inside Z85-encoded data don't count as binary

2026-02-04 15:10:59 +01:00 · 2014-02-25 11:08:59 +01:00
parent 99f611ab80
commit cbc45f5fa1
3 changed files with 120 additions and 22 deletions
--- a/1
+++ b/1
@@ -44,6 +44,7 @@ Z85 headers:
   is marked as to be ignored which could be saved as a state when using
   blockmode.
 Check is_utf8 license.
 Python binding, e.g.:
 py % cdll.LoadLibrary("libsodium.so.8")
--- a/include/pcp/z85.h
+++ b/include/pcp/z85.h
@@ -57,7 +57,7 @@ we pad the input with zeroes and remove them after decoding.
    \return Returns a pointer to the padded data.
 */
-unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen);
+byte *pcp_padfour(byte *src, size_t srclen, size_t *dstlen);
 /** Unpad padded input data.
@@ -70,7 +70,7 @@ unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen);
    \return Returns the unpadded size of the data.
 */
-size_t pcp_unpadfour(unsigned char *src, size_t srclen);
+size_t pcp_unpadfour(byte *src, size_t srclen);
 /** Decode data from Z85 encoding.
@@ -83,7 +83,7 @@ size_t pcp_unpadfour(unsigned char *src, size_t srclen);
            returns NULL. Check fatals_if_any().
 */
-unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen);
+byte *pcp_z85_decode(char *z85block, size_t *dstlen);
 /** Encode data to Z85 encoding.
@@ -96,7 +96,7 @@ unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen);
    \return Returns a string (char array) containing the Z85 encoded data.
 */
-char *pcp_z85_encode(unsigned char *raw, size_t srclen, size_t *dstlen);
+char *pcp_z85_encode(byte *raw, size_t srclen, size_t *dstlen);
 /** Read a Z85 encoded file.
@@ -120,7 +120,21 @@ char *pcp_readz85file(FILE *infile);
    \return Raw Z85 encoded string with comments, headers and newlines removed.
 */
-char *pcp_readz85string(unsigned char *input, size_t bufsize);
+char *pcp_readz85string(byte *input, size_t bufsize);
 /** Check if a binary array is utf8.
    Based on http://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c
    by Christoph Gärtner
    Modified to check only a single UTF-8 character at the start of the
    sequence. The given sequence must be at least 4 bytes long; no bounds checks are made.
    \param[in] string A byte sequence of 4 or more bytes.
    \return Returns 0 if the sequence does not start with a multi-byte UTF-8 character, otherwise the size of that character in bytes (2, 3 or 4).
 */
 uint8_t is_utf8(const byte * string);
 /** Determine if a buffer is binary or ascii.
@@ -129,7 +143,7 @@ char *pcp_readz85string(unsigned char *input, size_t bufsize);
    \return Returns 0 if the input is ascii or a number > 0 if
            it contains binary data.
 */
-size_t _buffer_is_binary(unsigned char *buf, size_t len);
+size_t _buffer_is_binary(byte *buf, size_t len);
 /** Determine if a char is a Z85 character
--- a/libpcp/z85.c
+++ b/libpcp/z85.c
@@ -22,19 +22,102 @@
 #include "z85.h"
-size_t _buffer_is_binary(unsigned char *buf, size_t len) {
+uint8_t is_utf8(const byte *bytes) {
  if( (// non-overlong 2-byte
       (0xC2 <= bytes[0] && bytes[0] <= 0xDF) &&
       (0x80 <= bytes[1] && bytes[1] <= 0xBF)
       )
      ) {
    return 2;
  }
  if( (// excluding overlongs
       bytes[0] == 0xE0 &&
       (0xA0 <= bytes[1] && bytes[1] <= 0xBF) &&
       (0x80 <= bytes[2] && bytes[2] <= 0xBF)
       ) ||
      (// straight 3-byte
       ((0xE1 <= bytes[0] && bytes[0] <= 0xEC) ||
 	bytes[0] == 0xEE ||
 	bytes[0] == 0xEF) &&
       (0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
       (0x80 <= bytes[2] && bytes[2] <= 0xBF)
       ) ||
      (// excluding surrogates
       bytes[0] == 0xED &&
       (0x80 <= bytes[1] && bytes[1] <= 0x9F) &&
       (0x80 <= bytes[2] && bytes[2] <= 0xBF)
       )
      ) {
    return 3;
  }
  if( (// planes 1-3
       bytes[0] == 0xF0 &&
       (0x90 <= bytes[1] && bytes[1] <= 0xBF) &&
       (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
       (0x80 <= bytes[3] && bytes[3] <= 0xBF)
       ) ||
      (// planes 4-15
       (0xF1 <= bytes[0] && bytes[0] <= 0xF3) &&
       (0x80 <= bytes[1] && bytes[1] <= 0xBF) &&
       (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
       (0x80 <= bytes[3] && bytes[3] <= 0xBF)
       ) ||
      (// plane 16
       bytes[0] == 0xF4 &&
       (0x80 <= bytes[1] && bytes[1] <= 0x8F) &&
       (0x80 <= bytes[2] && bytes[2] <= 0xBF) &&
       (0x80 <= bytes[3] && bytes[3] <= 0xBF)
       )
      ) {
    return 4;
  }
  return 0;
 }
 /**
  * Determine whether a buffer contains binary data or text.
  *
  * A byte counts as text if it is '\r', '\n' or printable according to
  * isprint(). Any other byte (including '\0') is accepted only if it starts
  * a well-formed multi-byte UTF-8 character as reported by is_utf8(); the
  * whole character is then skipped.
  *
  * Scanning starts at index 1, so buf[0] is never inspected. A hit at
  * index 0 would otherwise return 0, which callers read as "text".
  *
  * \param[in] buf Buffer to inspect.
  * \param[in] len Number of bytes in buf.
  * \return 0 if no binary byte was found, otherwise the index (> 0) of the
  *         first binary byte.
  */
 size_t _buffer_is_binary(byte *buf, size_t len) {
  size_t pos;
  size_t avail;
  byte wide[4];
  uint8_t charlen;

  for (pos = 1; pos < len; ++pos) {
    if (buf[pos] != '\0' &&
        (buf[pos] == '\r' || buf[pos] == '\n' || isprint(buf[pos]) != 0)) {
      continue; /* plain text byte */
    }

    /* Candidate binary byte: hand is_utf8() a zero padded 4 byte window.
       Zero is never a valid continuation byte, so a match can never
       extend beyond the bytes that are really available in buf. */
    memset(wide, 0, sizeof wide);
    avail = len - pos;
    if (avail > sizeof wide)
      avail = sizeof wide;
    memcpy(wide, &buf[pos], avail);

    /* evaluated afresh for every candidate, so one UTF-8 character does
       not whitelist binary bytes that follow later in the buffer */
    charlen = is_utf8(wide);
    if (charlen > 1) {
      /* step onto the last byte of the character, ++pos moves past it */
      pos += (size_t)charlen - 1;
      continue;
    }

    break; /* neither text nor UTF-8: binary, stop checking */
  }

  if (pos < len)
    return pos; /* binary */

  return 0; /* text */
 }
 uint8_t _parse_zchar(Buffer *z, uint8_t c, uint8_t is_comment) {
@@ -52,15 +135,15 @@ uint8_t _parse_zchar(Buffer *z, uint8_t c, uint8_t is_comment) {
  return is_comment;
 }
-unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen) {
+byte *pcp_padfour(byte *src, size_t srclen, size_t *dstlen) {
  size_t outlen, zerolen;
-  unsigned char *dst;
+  byte *dst;
  outlen = srclen;
  while (outlen % 4 != 0) outlen++;
  zerolen = outlen - srclen;
-  dst = (unsigned char*)ucmalloc(outlen);
+  dst = (byte*)ucmalloc(outlen);
  memcpy(dst, src, srclen); /*  add the original */
  memset(&dst[srclen], 0, zerolen); /*  pad with zeroes  */
@@ -69,7 +152,7 @@ unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen) {
  return dst;
 }
-size_t pcp_unpadfour(unsigned char *src, size_t srclen) {
+size_t pcp_unpadfour(byte *src, size_t srclen) {
  size_t outlen;
  size_t i;
@@ -85,8 +168,8 @@ size_t pcp_unpadfour(unsigned char *src, size_t srclen) {
  return outlen;
 }
-unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen) {
+byte *pcp_z85_decode(char *z85block, size_t *dstlen) {
-  unsigned char *bin = NULL;
+  byte *bin = NULL;
  size_t binlen, outlen; 
  binlen = strlen(z85block) * 4 / 5; 
@@ -104,12 +187,12 @@ unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen) {
  return bin;
 }
-char *pcp_z85_encode(unsigned char *raw, size_t srclen, size_t *dstlen) {
+char *pcp_z85_encode(byte *raw, size_t srclen, size_t *dstlen) {
  int pos = 0;
  size_t outlen, blocklen, zlen;
  /*  make z85 happy (size % 4) */
-  unsigned char *padded = pcp_padfour(raw, srclen, &outlen);
+  byte *padded = pcp_padfour(raw, srclen, &outlen);
  /*  encode to z85 */
  zlen = (outlen * 5 / 4) + 1;
@@ -146,10 +229,10 @@ char *pcp_z85_encode(unsigned char *raw, size_t srclen, size_t *dstlen) {
 char *pcp_readz85file(FILE *infile) {
-  unsigned char *input = NULL;
+  byte *input = NULL;
-  unsigned char *tmp = NULL;
+  byte *tmp = NULL;
  size_t bufsize = 0;
-  unsigned char byte[1];
+  byte byte[1];
  while(!feof(infile)) {
    if(!fread(&byte, 1, 1, infile))
@@ -171,7 +254,7 @@ char *pcp_readz85file(FILE *infile) {
  return pcp_readz85string(input, bufsize);
 }
-char *pcp_readz85string(unsigned char *input, size_t bufsize) {
+char *pcp_readz85string(byte *input, size_t bufsize) {
  int i;
  size_t MAXLINE = 1024;