mirror of
https://codeberg.org/scip/pcp.git
synced 2025-12-17 12:00:56 +01:00
Added is_utf8() check to _buffer_is_binary() so that UTF-8 comments inside Z85-encoded data don't count as binary
This commit is contained in:
1
TODO
1
TODO
@@ -44,6 +44,7 @@ Z85 headers:
|
|||||||
is marked as to be ignored which could be saved as a state when using
|
is marked as to be ignored which could be saved as a state when using
|
||||||
blockmode.
|
blockmode.
|
||||||
|
|
||||||
|
Check is_utf8 license.
|
||||||
|
|
||||||
Python binding, e.g.:
|
Python binding, e.g.:
|
||||||
py % cdll.LoadLibrary("libsodium.so.8")
|
py % cdll.LoadLibrary("libsodium.so.8")
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ we pad the input with zeroes and remove them after decoding.
|
|||||||
|
|
||||||
\return Returns a pointer to the padded data.
|
\return Returns a pointer to the padded data.
|
||||||
*/
|
*/
|
||||||
unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen);
|
byte *pcp_padfour(byte *src, size_t srclen, size_t *dstlen);
|
||||||
|
|
||||||
|
|
||||||
/** Unpad padded input data.
|
/** Unpad padded input data.
|
||||||
@@ -70,7 +70,7 @@ unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen);
|
|||||||
|
|
||||||
\return Returns the unpadded size of the data.
|
\return Returns the unpadded size of the data.
|
||||||
*/
|
*/
|
||||||
size_t pcp_unpadfour(unsigned char *src, size_t srclen);
|
size_t pcp_unpadfour(byte *src, size_t srclen);
|
||||||
|
|
||||||
/** Decode data from Z85 encoding.
|
/** Decode data from Z85 encoding.
|
||||||
|
|
||||||
@@ -83,7 +83,7 @@ size_t pcp_unpadfour(unsigned char *src, size_t srclen);
|
|||||||
returns NULL. Check fatals_if_any().
|
returns NULL. Check fatals_if_any().
|
||||||
|
|
||||||
*/
|
*/
|
||||||
unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen);
|
byte *pcp_z85_decode(char *z85block, size_t *dstlen);
|
||||||
|
|
||||||
|
|
||||||
/** Encode data to Z85 encoding.
|
/** Encode data to Z85 encoding.
|
||||||
@@ -96,7 +96,7 @@ unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen);
|
|||||||
|
|
||||||
\return Returns a string (char array) containing the Z85 encoded data.
|
\return Returns a string (char array) containing the Z85 encoded data.
|
||||||
*/
|
*/
|
||||||
char *pcp_z85_encode(unsigned char *raw, size_t srclen, size_t *dstlen);
|
char *pcp_z85_encode(byte *raw, size_t srclen, size_t *dstlen);
|
||||||
|
|
||||||
/** Read a Z85 encoded file.
|
/** Read a Z85 encoded file.
|
||||||
|
|
||||||
@@ -120,7 +120,21 @@ char *pcp_readz85file(FILE *infile);
|
|||||||
\return Raw Z85 encoded string with comments, headers and newlines removed.
|
\return Raw Z85 encoded string with comments, headers and newlines removed.
|
||||||
|
|
||||||
*/
|
*/
|
||||||
char *pcp_readz85string(unsigned char *input, size_t bufsize);
|
char *pcp_readz85string(byte *input, size_t bufsize);
|
||||||
|
|
||||||
|
/** Check if a binary array is utf8.
|
||||||
|
|
||||||
|
Based on http://stackoverflow.com/questions/1031645/how-to-detect-utf-8-in-plain-c
|
||||||
|
by Christoph Gärtner
|
||||||
|
|
||||||
|
Modified to only check for one utf8 char. The given sequence must have
|
||||||
|
at least 4 bytes length. No boundary checks are being made.
|
||||||
|
|
||||||
|
\param[in] bytes A byte sequence with 4 or more bytes length.
|
||||||
|
|
||||||
|
\return Returns 0 if the sequence is not utf8 or a number greater than 1 indicating the size of the utf8 char.
|
||||||
|
*/
|
||||||
|
uint8_t is_utf8(const byte * string);
|
||||||
|
|
||||||
/** Determine if a buffer is binary or ascii.
|
/** Determine if a buffer is binary or ascii.
|
||||||
|
|
||||||
@@ -129,7 +143,7 @@ char *pcp_readz85string(unsigned char *input, size_t bufsize);
|
|||||||
\return Returns 0 if the input is ascii or a number > 0 if
|
\return Returns 0 if the input is ascii or a number > 0 if
|
||||||
it contains binary data.
|
it contains binary data.
|
||||||
*/
|
*/
|
||||||
size_t _buffer_is_binary(unsigned char *buf, size_t len);
|
size_t _buffer_is_binary(byte *buf, size_t len);
|
||||||
|
|
||||||
|
|
||||||
/** Determine if a char is a Z85 character
|
/** Determine if a char is a Z85 character
|
||||||
|
|||||||
115
libpcp/z85.c
115
libpcp/z85.c
@@ -22,19 +22,102 @@
|
|||||||
|
|
||||||
#include "z85.h"
|
#include "z85.h"
|
||||||
|
|
||||||
size_t _buffer_is_binary(unsigned char *buf, size_t len) {
|
/* Classify the first character of bytes as a multi-byte UTF-8 sequence.

   The lead byte selects the expected sequence width and the range the
   second byte has to fall into (this is what rules out overlong
   encodings, surrogates and code points beyond plane 16). All further
   bytes must be ordinary continuation bytes (0x80..0xBF).

   Returns the width (2, 3 or 4) on a match, 0 otherwise. Bytes are only
   read as far as the preceding ones matched. */
uint8_t is_utf8(const byte *bytes) {
  const byte lead = bytes[0];
  uint8_t width;      /* total size of the sequence selected by lead */
  uint8_t second_min; /* inclusive bounds for bytes[1] */
  uint8_t second_max;
  uint8_t k;

  if (lead >= 0xC2 && lead <= 0xDF) {
    /* non-overlong 2-byte */
    width = 2; second_min = 0x80; second_max = 0xBF;
  }
  else if (lead == 0xE0) {
    /* 3-byte, excluding overlongs */
    width = 3; second_min = 0xA0; second_max = 0xBF;
  }
  else if (lead == 0xED) {
    /* 3-byte, excluding surrogates */
    width = 3; second_min = 0x80; second_max = 0x9F;
  }
  else if (lead >= 0xE1 && lead <= 0xEF) {
    /* straight 3-byte: 0xE1..0xEC, 0xEE, 0xEF (0xED was handled above) */
    width = 3; second_min = 0x80; second_max = 0xBF;
  }
  else if (lead == 0xF0) {
    /* planes 1-3 */
    width = 4; second_min = 0x90; second_max = 0xBF;
  }
  else if (lead >= 0xF1 && lead <= 0xF3) {
    /* planes 4-15 */
    width = 4; second_min = 0x80; second_max = 0xBF;
  }
  else if (lead == 0xF4) {
    /* plane 16 */
    width = 4; second_min = 0x80; second_max = 0x8F;
  }
  else {
    /* not a valid lead byte of a multi-byte sequence */
    return 0;
  }

  if (bytes[1] < second_min || bytes[1] > second_max) {
    return 0;
  }

  for (k = 2; k < width; k++) {
    if (bytes[k] < 0x80 || bytes[k] > 0xBF) {
      return 0;
    }
  }

  return width;
}
|
||||||
|
|
||||||
|
/* Scan buf for the first byte that is neither printable ASCII, CR, LF
   nor part of a well-formed multi-byte UTF-8 character.

   Returns the offset of that byte (always > 0) if one is found, which
   marks the buffer as binary, or 0 if the buffer looks like text.

   NOTE: the scan starts at offset 1, to circumvent returning 0 if we find
   a match at position 0, which would lead the caller to believe the
   buffer is not binary. As a consequence buf[0] is never inspected. */
size_t _buffer_is_binary(byte *buf, size_t len) {
  size_t pos;

  for (pos = 1; pos < len; ++pos) {
    /* candidate sequence, zero padded; a zero is never a valid UTF-8
       continuation byte, so a sequence truncated by the end of the
       buffer cannot match */
    byte wide[4] = {0};
    size_t avail;
    size_t i;
    uint8_t width;

    if (buf[pos] != '\0' &&
        (buf[pos] == '\r' || buf[pos] == '\n' || isprint(buf[pos]) != 0)) {
      continue; /* ordinary text character */
    }

    /* it's probably a binary char, but it may start a 2, 3 or 4 byte
       utf8 char: copy as many bytes as the buffer still holds */
    avail = len - pos;
    if (avail > sizeof(wide)) {
      avail = sizeof(wide);
    }
    for (i = 0; i < avail; i++) {
      wide[i] = buf[pos + i];
    }

    /* evaluated freshly for every candidate, so one utf8 char found
       earlier does not whitelist later binary bytes */
    width = is_utf8(wide);
    if (width > 1 && width <= avail) {
      /* jump over the rest of the utf8 char; the loop increment skips
         its lead byte */
      pos += (size_t)width - 1;
      continue;
    }

    break; /* if we reach this, then it's binary and not utf8, stop checking */
  }

  if (pos < len)
    return pos; /* binary */
  else
    return 0; /* text */
}
|
||||||
|
|
||||||
uint8_t _parse_zchar(Buffer *z, uint8_t c, uint8_t is_comment) {
|
uint8_t _parse_zchar(Buffer *z, uint8_t c, uint8_t is_comment) {
|
||||||
@@ -52,15 +135,15 @@ uint8_t _parse_zchar(Buffer *z, uint8_t c, uint8_t is_comment) {
|
|||||||
return is_comment;
|
return is_comment;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen) {
|
byte *pcp_padfour(byte *src, size_t srclen, size_t *dstlen) {
|
||||||
size_t outlen, zerolen;
|
size_t outlen, zerolen;
|
||||||
unsigned char *dst;
|
byte *dst;
|
||||||
|
|
||||||
outlen = srclen;
|
outlen = srclen;
|
||||||
while (outlen % 4 != 0) outlen++;
|
while (outlen % 4 != 0) outlen++;
|
||||||
zerolen = outlen - srclen;
|
zerolen = outlen - srclen;
|
||||||
|
|
||||||
dst = (unsigned char*)ucmalloc(outlen);
|
dst = (byte*)ucmalloc(outlen);
|
||||||
memcpy(dst, src, srclen); /* add the original */
|
memcpy(dst, src, srclen); /* add the original */
|
||||||
memset(&dst[srclen], 0, zerolen); /* pad with zeroes */
|
memset(&dst[srclen], 0, zerolen); /* pad with zeroes */
|
||||||
|
|
||||||
@@ -69,7 +152,7 @@ unsigned char *pcp_padfour(unsigned char *src, size_t srclen, size_t *dstlen) {
|
|||||||
return dst;
|
return dst;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t pcp_unpadfour(unsigned char *src, size_t srclen) {
|
size_t pcp_unpadfour(byte *src, size_t srclen) {
|
||||||
size_t outlen;
|
size_t outlen;
|
||||||
size_t i;
|
size_t i;
|
||||||
|
|
||||||
@@ -85,8 +168,8 @@ size_t pcp_unpadfour(unsigned char *src, size_t srclen) {
|
|||||||
return outlen;
|
return outlen;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen) {
|
byte *pcp_z85_decode(char *z85block, size_t *dstlen) {
|
||||||
unsigned char *bin = NULL;
|
byte *bin = NULL;
|
||||||
size_t binlen, outlen;
|
size_t binlen, outlen;
|
||||||
|
|
||||||
binlen = strlen(z85block) * 4 / 5;
|
binlen = strlen(z85block) * 4 / 5;
|
||||||
@@ -104,12 +187,12 @@ unsigned char *pcp_z85_decode(char *z85block, size_t *dstlen) {
|
|||||||
return bin;
|
return bin;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *pcp_z85_encode(unsigned char *raw, size_t srclen, size_t *dstlen) {
|
char *pcp_z85_encode(byte *raw, size_t srclen, size_t *dstlen) {
|
||||||
int pos = 0;
|
int pos = 0;
|
||||||
size_t outlen, blocklen, zlen;
|
size_t outlen, blocklen, zlen;
|
||||||
|
|
||||||
/* make z85 happy (size % 4) */
|
/* make z85 happy (size % 4) */
|
||||||
unsigned char *padded = pcp_padfour(raw, srclen, &outlen);
|
byte *padded = pcp_padfour(raw, srclen, &outlen);
|
||||||
|
|
||||||
/* encode to z85 */
|
/* encode to z85 */
|
||||||
zlen = (outlen * 5 / 4) + 1;
|
zlen = (outlen * 5 / 4) + 1;
|
||||||
@@ -146,10 +229,10 @@ char *pcp_z85_encode(unsigned char *raw, size_t srclen, size_t *dstlen) {
|
|||||||
|
|
||||||
|
|
||||||
char *pcp_readz85file(FILE *infile) {
|
char *pcp_readz85file(FILE *infile) {
|
||||||
unsigned char *input = NULL;
|
byte *input = NULL;
|
||||||
unsigned char *tmp = NULL;
|
byte *tmp = NULL;
|
||||||
size_t bufsize = 0;
|
size_t bufsize = 0;
|
||||||
unsigned char byte[1];
|
byte byte[1];
|
||||||
|
|
||||||
while(!feof(infile)) {
|
while(!feof(infile)) {
|
||||||
if(!fread(&byte, 1, 1, infile))
|
if(!fread(&byte, 1, 1, infile))
|
||||||
@@ -171,7 +254,7 @@ char *pcp_readz85file(FILE *infile) {
|
|||||||
return pcp_readz85string(input, bufsize);
|
return pcp_readz85string(input, bufsize);
|
||||||
}
|
}
|
||||||
|
|
||||||
char *pcp_readz85string(unsigned char *input, size_t bufsize) {
|
char *pcp_readz85string(byte *input, size_t bufsize) {
|
||||||
int i;
|
int i;
|
||||||
size_t MAXLINE = 1024;
|
size_t MAXLINE = 1024;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user