Added an is_utf8() check to _buffer_is_binary() so that UTF-8 comments inside Z85-encoded data don't count as binary

This commit is contained in:
git@daemon.de
2014-02-25 11:08:59 +01:00
parent 99f611ab80
commit cbc45f5fa1
3 changed files with 120 additions and 22 deletions

View File

@@ -22,19 +22,102 @@
#include "z85.h"
size_t _buffer_is_binary(unsigned char *buf, size_t len) {
/*
 * Test whether bytes[0..] starts a well-formed multi-byte UTF-8 sequence.
 *
 * Returns the length of the sequence (2, 3 or 4) or 0 if the bytes do
 * not form one. Plain ASCII (single byte) is not recognised here and
 * yields 0 as well.
 *
 * Overlong encodings, UTF-16 surrogates (lead 0xED, second byte above
 * 0x9F) and anything beyond plane 16 are rejected. Depending on the
 * lead byte up to four bytes are read, so the caller has to provide
 * that many readable bytes.
 */
uint8_t is_utf8(const byte *bytes) {
  const int lead = bytes[0];
  int second_min = 0x80; /* allowed range of the second byte; */
  int second_max = 0xBF; /* narrowed below for some lead bytes */
  uint8_t seqlen;
  uint8_t k;

  if (lead >= 0xC2 && lead <= 0xDF) {
    /* non-overlong 2-byte */
    seqlen = 2;
  }
  else if (lead >= 0xE0 && lead <= 0xEF) {
    seqlen = 3;
    if (lead == 0xE0)
      second_min = 0xA0; /* excluding overlongs */
    else if (lead == 0xED)
      second_max = 0x9F; /* excluding surrogates */
  }
  else if (lead >= 0xF0 && lead <= 0xF4) {
    seqlen = 4;
    if (lead == 0xF0)
      second_min = 0x90; /* planes 1-3 */
    else if (lead == 0xF4)
      second_max = 0x8F; /* plane 16 */
  }
  else {
    /* ASCII, stray continuation byte or invalid lead byte */
    return 0;
  }

  if (bytes[1] < second_min || bytes[1] > second_max)
    return 0;

  /* all remaining bytes must be ordinary continuation bytes */
  for (k = 2; k < seqlen; k++) {
    if (bytes[k] < 0x80 || bytes[k] > 0xBF)
      return 0;
  }

  return seqlen;
}
/*
 * Decide whether a buffer contains binary data.
 *
 * buf  the data to inspect
 * len  number of bytes in buf
 *
 * Returns the position of the first byte which is considered binary,
 * or 0 if the whole buffer looks like text. A byte counts as text if
 * it is '\r', '\n', printable according to isprint(), or part of a
 * well-formed multi-byte UTF-8 sequence as recognised by is_utf8().
 *
 * The scan starts at position 1, not 0: a hit at position 0 would have
 * to be returned as 0, which the caller would take for "not binary".
 * As a consequence buf[0] itself is never examined.
 */
size_t _buffer_is_binary(byte *buf, size_t len) {
  size_t pos;
  size_t i;
  byte wide[4];
  uint8_t seqlen;

  for (pos = 1; pos < len; ++pos) {
    if (buf[pos] == '\0' ||
        (buf[pos] != '\r' && buf[pos] != '\n' && isprint(buf[pos]) == 0)) {
      /* it's probably a binary char, but it might start a utf8 char */

      /* hand is_utf8() a fresh, zero padded copy of up to 4 bytes. The
         padding can never match a continuation byte, so a sequence cut
         off by the end of the buffer is rejected instead of read past
         the end */
      memset(wide, 0, sizeof wide);
      for (i = 0; i < sizeof wide && i < len - pos; i++)
        wide[i] = buf[pos + i];

      /* evaluated anew for every candidate, an earlier utf8 hit must
         not whitelist binary bytes further down the buffer */
      seqlen = is_utf8(wide);

      if (seqlen == 0)
        break; /* it's binary and not utf8, stop checking */

      /* jump over the utf8 char, its last byte is skipped by ++pos */
      pos += (size_t)seqlen - 1;
    }
  }

  if (pos < len)
    return pos; /* binary */
  else
    return 0; /* text */
}
uint8_t _parse_zchar(Buffer *z, uint8_t c, uint8_t is_comment) {
@@ -52,15 +135,15 @@ uint8_t _parse_zchar(Buffer *z, uint8_t c, uint8_t is_comment) {
return is_comment;
}
unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen) {
byte *pcp_padfour(byte *src, size_t srclen, size_t *dstlen) {
size_t outlen, zerolen;
unsigned char *dst;
byte *dst;
outlen = srclen;
while (outlen % 4 != 0) outlen++;
zerolen = outlen - srclen;
dst = (unsigned char*)ucmalloc(outlen);
dst = (byte*)ucmalloc(outlen);
memcpy(dst, src, srclen); /* add the original */
memset(&dst[srclen], 0, zerolen); /* pad with zeroes */
@@ -69,7 +152,7 @@ unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen) {
return dst;
}
size_t pcp_unpadfour(unsigned char *src, size_t srclen) {
size_t pcp_unpadfour(byte *src, size_t srclen) {
size_t outlen;
size_t i;
@@ -85,8 +168,8 @@ size_t pcp_unpadfour(unsigned char *src, size_t srclen) {
return outlen;
}
unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen) {
unsigned char *bin = NULL;
byte *pcp_z85_decode(char *z85block, size_t *dstlen) {
byte *bin = NULL;
size_t binlen, outlen;
binlen = strlen(z85block) * 4 / 5;
@@ -104,12 +187,12 @@ unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen) {
return bin;
}
char *pcp_z85_encode(unsigned char *raw, size_t srclen, size_t *dstlen) {
char *pcp_z85_encode(byte *raw, size_t srclen, size_t *dstlen) {
int pos = 0;
size_t outlen, blocklen, zlen;
/* make z85 happy (size % 4) */
unsigned char *padded = pcp_padfour(raw, srclen, &outlen);
byte *padded = pcp_padfour(raw, srclen, &outlen);
/* encode to z85 */
zlen = (outlen * 5 / 4) + 1;
@@ -146,10 +229,10 @@ char *pcp_z85_encode(unsigned char *raw, size_t srclen, size_t *dstlen) {
char *pcp_readz85file(FILE *infile) {
unsigned char *input = NULL;
unsigned char *tmp = NULL;
byte *input = NULL;
byte *tmp = NULL;
size_t bufsize = 0;
unsigned char byte[1];
byte byte[1];
while(!feof(infile)) {
if(!fread(&byte, 1, 1, infile))
@@ -171,7 +254,7 @@ char *pcp_readz85file(FILE *infile) {
return pcp_readz85string(input, bufsize);
}
char *pcp_readz85string(unsigned char *input, size_t bufsize) {
char *pcp_readz85string(byte *input, size_t bufsize) {
int i;
size_t MAXLINE = 1024;