lexer: fix encoding of unicode surrogate pairs

Signed-off-by: Jo-Philipp Wich <jo@mein.io>
author: Jo-Philipp Wich <jo@mein.io> 2020-09-07 11:52:21 +0200
committer: Jo-Philipp Wich <jo@mein.io> 2020-09-07 16:30:34 +0200
commit: c64877172c13b89f40d310e27552bb80e1ea230f (patch)
tree: 3f19084f44a0005ebaa64d396256d1994d959b47 /lexer.c
parent: 3886dd097f07a3b57dfa884a35e2ae0e41b1b864 (diff)
1 files changed, 157 insertions, 72 deletions
diff --git a/lexer.c b/lexer.c
index 2277d67..dbac029 100644
--- a/lexer.c
+++ b/lexer.c
@@ -322,6 +322,7 @@ parse_string(const char *buf, struct ut_op *op, struct ut_state *s)
 	const char *in = buf;
 	bool esc = false;
 	int rem = sizeof(str) - 1;
+	int lead_surrogate = 0;
 	int code;
 
 	while (*in) {
@@ -331,16 +332,48 @@ parse_string(const char *buf, struct ut_op *op, struct ut_state *s)
 			if (in[0] == 'u') {
 				if (isxdigit(in[1]) && isxdigit(in[2]) &&
 				    isxdigit(in[3]) && isxdigit(in[4])) {
-					if (!utf8enc(&out, &rem,
-					             hex(in[1]) * 16 * 16 * 16 +
-					             hex(in[2]) * 16 * 16 +
-					             hex(in[3]) * 16 +
-					             hex(in[4]))) {
+					code = hex(in[1]) * 16 * 16 * 16 +
+					       hex(in[2]) * 16 * 16 +
+					       hex(in[3]) * 16 +
+					       hex(in[4]);
+
+					/* is a leading surrogate value */
+					if ((code & 0xFC00) == 0xD800) {
+						/* found a subsequent leading surrogate, ignore and emit replacement char for previous one */
+						if (lead_surrogate) {
+							if (!utf8enc(&out, &rem, 0xFFFD)) {
+								s->off += (in - buf);
+
+								return -UT_ERROR_OVERLONG_STRING;
+							}
+						}
+
+						/* store surrogate value and advance to next escape sequence */
+						lead_surrogate = code;
+						goto next;
+					}
+
+					/* is a trailing surrogate value */
+					else if ((code & 0xFC00) == 0xDC00) {
+						/* found a trailing surrogate following a leading one, combine and encode */
+						if (lead_surrogate) {
+							code = 0x10000 + ((lead_surrogate & 0x3FF) << 10) + (code & 0x3FF);
+							lead_surrogate = 0;
+						}
+
+						/* trailing surrogate not following a leading one, ignore and use replacement char */
+						else {
+							code = 0xFFFD;
+						}
+					}
+
+					if (!utf8enc(&out, &rem, code)) {
 						s->off += (in - buf);
 
 						return -UT_ERROR_OVERLONG_STRING;
 					}
 
+next:
 					in += 5;
 				}
 				else {
@@ -350,122 +383,174 @@ parse_string(const char *buf, struct ut_op *op, struct ut_state *s)
 				}
 			}
 
-			/* \xFF */
-			else if (in[0] == 'x') {
-				if (isxdigit(in[1]) && isxdigit(in[2])) {
-					if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2]))) {
+			/* other escape sequences */
+			else {
+				/* found an escape sequence other than a unicode escape following a leading
+				   unicode surrogate, emit replacement character and skip surrogate. */
+				if (lead_surrogate) {
+					if (!utf8enc(&out, &rem, 0xFFFD)) {
 						s->off += (in - buf);
 
 						return -UT_ERROR_OVERLONG_STRING;
 					}
 
-					in += 3;
-				}
-				else {
-					s->off += (in - buf);
-					return -UT_ERROR_INVALID_ESCAPE;
+					lead_surrogate = 0;
 				}
-			}
 
-			/* \377, \77 or \7 */
-			else if (in[0] >= '0' && in[0] <= '7') {
-				/* \377 */
-				if (in[1] >= '0' && in[1] <= '7' &&
-				    in[2] >= '0' && in[2] <= '7') {
-					code = dec(in[0]) * 8 * 8 +
-					       dec(in[1]) * 8 +
-					       dec(in[2]);
+				/* \xFF */
+				if (in[0] == 'x') {
+
+
+					if (isxdigit(in[1]) && isxdigit(in[2])) {
+						if (!utf8enc(&out, &rem, hex(in[1]) * 16 + hex(in[2]))) {
+							s->off += (in - buf);
+
+							return -UT_ERROR_OVERLONG_STRING;
+						}
 
-					if (code > 255) {
+						in += 3;
+					}
+					else {
 						s->off += (in - buf);
 
 						return -UT_ERROR_INVALID_ESCAPE;
 					}
+				}
 
-					if (!utf8enc(&out, &rem, code)) {
-						s->off += (in - buf);
+				/* \377, \77 or \7 */
+				else if (in[0] >= '0' && in[0] <= '7') {
+					if (lead_surrogate) {
+						if (!utf8enc(&out, &rem, 0xFFFD)) {
+							s->off += (in - buf);
 
-						return -UT_ERROR_OVERLONG_STRING;
+							return -UT_ERROR_OVERLONG_STRING;
+						}
+
+						lead_surrogate = 0;
 					}
 
-					in += 3;
-				}
+					/* \377 */
+					if (in[1] >= '0' && in[1] <= '7' &&
+					    in[2] >= '0' && in[2] <= '7') {
+						code = dec(in[0]) * 8 * 8 +
+						       dec(in[1]) * 8 +
+						       dec(in[2]);
 
-				/* \77 */
-				else if (in[1] >= '0' && in[1] <= '7') {
-					if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1]))) {
-						s->off += (in - buf);
+						if (code > 255) {
+							s->off += (in - buf);
 
-						return -UT_ERROR_OVERLONG_STRING;
+							return -UT_ERROR_INVALID_ESCAPE;
+						}
+
+						if (!utf8enc(&out, &rem, code)) {
+							s->off += (in - buf);
+
+							return -UT_ERROR_OVERLONG_STRING;
+						}
+
+						in += 3;
+					}
+
+					/* \77 */
+					else if (in[1] >= '0' && in[1] <= '7') {
+						if (!utf8enc(&out, &rem, dec(in[0]) * 8 + dec(in[1]))) {
+							s->off += (in - buf);
+
+							return -UT_ERROR_OVERLONG_STRING;
+						}
+
+						in += 2;
 					}
 
-					in += 2;
+					/* \7 */
+					else {
+						if (!utf8enc(&out, &rem, dec(in[0]))) {
+							s->off += (in - buf);
+
+							return -UT_ERROR_OVERLONG_STRING;
+						}
+
+						in += 1;
+					}
 				}
 
-				/* \7 */
+				/* single character escape */
 				else {
-					if (!utf8enc(&out, &rem, dec(in[0]))) {
-						s->off += (in - buf);
+					if (lead_surrogate) {
+						if (!utf8enc(&out, &rem, 0xFFFD)) {
+							s->off += (in - buf);
 
-						return -UT_ERROR_OVERLONG_STRING;
+							return -UT_ERROR_OVERLONG_STRING;
+						}
+
+						lead_surrogate = 0;
 					}
 
-					in += 1;
-				}
-			}
+					if (rem-- < 1) {
+						s->off += (in - buf);
 
-			/* single character escape */
-			else {
-				if (rem-- < 1) {
-					s->off += (in - buf);
+						return -UT_ERROR_OVERLONG_STRING;
+					}
 
-					return -UT_ERROR_OVERLONG_STRING;
-				}
+					switch (in[0]) {
+					case 'a': *out = '\a'; break;
+					case 'b': *out = '\b'; break;
+					case 'e': *out = '\e'; break;
+					case 'f': *out = '\f'; break;
+					case 'n': *out = '\n'; break;
+					case 'r': *out = '\r'; break;
+					case 't': *out = '\t'; break;
+					case 'v': *out = '\v'; break;
+					default:
+						*out = *in;
+						break;
+					}
 
-				switch (in[0]) {
-				case 'a': *out = '\a'; break;
-				case 'b': *out = '\b'; break;
-				case 'e': *out = '\e'; break;
-				case 'f': *out = '\f'; break;
-				case 'n': *out = '\n'; break;
-				case 'r': *out = '\r'; break;
-				case 't': *out = '\t'; break;
-				case 'v': *out = '\v'; break;
-				default:
-					*out = *in;
-					break;
+					in++;
+					out++;
 				}
-
-				in++;
-				out++;
 			}
 
 			esc = false;
+			continue;
 		}
 
 		/* begin of escape sequence */
-		else if (*in == '\\') {
+		if (*in == '\\') {
 			in++;
 			esc = true;
+			continue;
+		}
+
+
+		/* there's a non-escape following a previous leading unicode surrogate,
+		 * ignore surrogate and emit replacement char */
+		if (lead_surrogate) {
+			if (!utf8enc(&out, &rem, 0xFFFD)) {
+				s->off += (in - buf);
+
+				return -UT_ERROR_OVERLONG_STRING;
+			}
+
+			lead_surrogate = 0;
 		}
 
+
 		/* terminating quote */
-		else if (*in == q) {
+		if (*in == q) {
 			op->val = json_object_new_string_len(str, sizeof(str) - 1 - rem);
 
 			return (in - buf) + 2;
 		}
 
 		/* ordinary char */
-		else {
-			if (rem-- < 1) {
-				s->off += (in - buf);
-
-				return -UT_ERROR_OVERLONG_STRING;
-			}
+		if (rem-- < 1) {
+			s->off += (in - buf);
 
-			*out++ = *in++;
+			return -UT_ERROR_OVERLONG_STRING;
 		}
+
+		*out++ = *in++;
 	}
 
 	return -UT_ERROR_UNTERMINATED_STRING;
author	Jo-Philipp Wich <jo@mein.io>	2020-09-07 11:52:21 +0200
committer	Jo-Philipp Wich <jo@mein.io>	2020-09-07 16:30:34 +0200
commit	c64877172c13b89f40d310e27552bb80e1ea230f (patch)
tree	3f19084f44a0005ebaa64d396256d1994d959b47 /lexer.c
parent	3886dd097f07a3b57dfa884a35e2ae0e41b1b864 (diff)