diff options
author | Damian Okrasa <dokrasa@gmail.com> | 2014-03-25 20:20:26 +0100 |
---|---|---|
committer | Roberto E. Vargas Caballero <k0ga@shike2.com> | 2014-03-27 07:19:37 +0100 |
commit | 45b808b88ee63f21a188800ba3473a24a3c4b987 (patch) | |
tree | e4f5285e5b27a438c770aa4d794823e027cccac3 | |
parent | 71328cbcdc88f4fdfbb62d8c0324938e245c8971 (diff) | |
download | st-patched-45b808b88ee63f21a188800ba3473a24a3c4b987.tar.bz2 st-patched-45b808b88ee63f21a188800ba3473a24a3c4b987.tar.xz st-patched-45b808b88ee63f21a188800ba3473a24a3c4b987.zip |
new utf decoder
This patch replaces the current UTF-8 decoder with a new one, which is ~50
lines shorter and should be easier to understand. Parsing 5- and 6-byte
sequences, if necessary, requires only a trivial modification of the UTF_SIZ
constant and the utfbyte, utfmask, utfmin, and utfmax arrays.
-rw-r--r-- | st.c | 214 |
1 files changed, 81 insertions, 133 deletions
@@ -55,6 +55,7 @@ char *argv0; | |||
55 | #define XEMBED_FOCUS_OUT 5 | 55 | #define XEMBED_FOCUS_OUT 5 |
56 | 56 | ||
57 | /* Arbitrary sizes */ | 57 | /* Arbitrary sizes */ |
58 | #define UTF_INVALID 0xFFFD | ||
58 | #define UTF_SIZ 4 | 59 | #define UTF_SIZ 4 |
59 | #define ESC_BUF_SIZ (128*UTF_SIZ) | 60 | #define ESC_BUF_SIZ (128*UTF_SIZ) |
60 | #define ESC_ARG_SIZ 16 | 61 | #define ESC_ARG_SIZ 16 |
@@ -442,10 +443,12 @@ static void selcopy(void); | |||
442 | static void selscroll(int, int); | 443 | static void selscroll(int, int); |
443 | static void selsnap(int, int *, int *, int); | 444 | static void selsnap(int, int *, int *, int); |
444 | 445 | ||
445 | static int utf8decode(char *, long *); | 446 | static size_t utf8decode(char *, long *, size_t); |
446 | static int utf8encode(long *, char *); | 447 | static long utf8decodebyte(char, size_t *); |
447 | static int utf8size(char *); | 448 | static size_t utf8encode(long, char *, size_t); |
448 | static int isfullutf8(char *, int); | 449 | static char utf8encodebyte(long, size_t); |
450 | static size_t utf8len(char *); | ||
451 | static size_t utf8validate(long *, size_t); | ||
449 | 452 | ||
450 | static ssize_t xwrite(int, char *, size_t); | 453 | static ssize_t xwrite(int, char *, size_t); |
451 | static void *xmalloc(size_t); | 454 | static void *xmalloc(size_t); |
@@ -490,6 +493,11 @@ static int oldbutton = 3; /* button event on startup: 3 = release */ | |||
490 | static char *usedfont = NULL; | 493 | static char *usedfont = NULL; |
491 | static double usedfontsize = 0; | 494 | static double usedfontsize = 0; |
492 | 495 | ||
496 | static uchar utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0}; | ||
497 | static uchar utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8}; | ||
498 | static long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000}; | ||
499 | static long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; | ||
500 | |||
493 | /* Font Ring Cache */ | 501 | /* Font Ring Cache */ |
494 | enum { | 502 | enum { |
495 | FRC_NORMAL, | 503 | FRC_NORMAL, |
@@ -549,128 +557,69 @@ xstrdup(char *s) { | |||
549 | return p; | 557 | return p; |
550 | } | 558 | } |
551 | 559 | ||
552 | int | 560 | size_t |
553 | utf8decode(char *s, long *u) { | 561 | utf8decode(char *c, long *u, size_t clen) { |
554 | uchar c; | 562 | size_t i, j, len, type; |
555 | int i, n, rtn; | 563 | long udecoded; |
556 | |||
557 | rtn = 1; | ||
558 | c = *s; | ||
559 | if(~c & 0x80) { /* 0xxxxxxx */ | ||
560 | *u = c; | ||
561 | return rtn; | ||
562 | } else if((c & 0xE0) == 0xC0) { /* 110xxxxx */ | ||
563 | *u = c & 0x1F; | ||
564 | n = 1; | ||
565 | } else if((c & 0xF0) == 0xE0) { /* 1110xxxx */ | ||
566 | *u = c & 0x0F; | ||
567 | n = 2; | ||
568 | } else if((c & 0xF8) == 0xF0) { /* 11110xxx */ | ||
569 | *u = c & 0x07; | ||
570 | n = 3; | ||
571 | } else { | ||
572 | goto invalid; | ||
573 | } | ||
574 | |||
575 | for(i = n, ++s; i > 0; --i, ++rtn, ++s) { | ||
576 | c = *s; | ||
577 | if((c & 0xC0) != 0x80) /* 10xxxxxx */ | ||
578 | goto invalid; | ||
579 | *u <<= 6; | ||
580 | *u |= c & 0x3F; | ||
581 | } | ||
582 | |||
583 | if((n == 1 && *u < 0x80) || | ||
584 | (n == 2 && *u < 0x800) || | ||
585 | (n == 3 && *u < 0x10000) || | ||
586 | (*u >= 0xD800 && *u <= 0xDFFF)) { | ||
587 | goto invalid; | ||
588 | } | ||
589 | |||
590 | return rtn; | ||
591 | invalid: | ||
592 | *u = 0xFFFD; | ||
593 | |||
594 | return rtn; | ||
595 | } | ||
596 | 564 | ||
597 | int | 565 | *u = UTF_INVALID; |
598 | utf8encode(long *u, char *s) { | 566 | if(!clen) |
599 | uchar *sp; | 567 | return 0; |
600 | ulong uc; | 568 | udecoded = utf8decodebyte(c[0], &len); |
601 | int i, n; | 569 | if(!BETWEEN(len, 1, UTF_SIZ)) |
602 | |||
603 | sp = (uchar *)s; | ||
604 | uc = *u; | ||
605 | if(uc < 0x80) { | ||
606 | *sp = uc; /* 0xxxxxxx */ | ||
607 | return 1; | 570 | return 1; |
608 | } else if(*u < 0x800) { | 571 | for(i = 1, j = 1; i < clen && j < len; ++i, ++j) { |
609 | *sp = (uc >> 6) | 0xC0; /* 110xxxxx */ | 572 | udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type); |
610 | n = 1; | 573 | if(type != 0) |
611 | } else if(uc < 0x10000) { | 574 | return j; |
612 | *sp = (uc >> 12) | 0xE0; /* 1110xxxx */ | ||
613 | n = 2; | ||
614 | } else if(uc <= 0x10FFFF) { | ||
615 | *sp = (uc >> 18) | 0xF0; /* 11110xxx */ | ||
616 | n = 3; | ||
617 | } else { | ||
618 | goto invalid; | ||
619 | } | 575 | } |
576 | if(j < len) | ||
577 | return 0; | ||
578 | *u = udecoded; | ||
579 | utf8validate(u, len); | ||
580 | return len; | ||
581 | } | ||
620 | 582 | ||
621 | for(i=n,++sp; i>0; --i,++sp) | 583 | long |
622 | *sp = ((uc >> 6*(i-1)) & 0x3F) | 0x80; /* 10xxxxxx */ | 584 | utf8decodebyte(char c, size_t *i) { |
623 | 585 | for(*i = 0; *i < LEN(utfmask); ++(*i)) | |
624 | return n+1; | 586 | if(((uchar)c & utfmask[*i]) == utfbyte[*i]) |
625 | invalid: | 587 | return (uchar)c & ~utfmask[*i]; |
626 | /* U+FFFD */ | 588 | return 0; |
627 | *s++ = '\xEF'; | ||
628 | *s++ = '\xBF'; | ||
629 | *s = '\xBD'; | ||
630 | |||
631 | return 3; | ||
632 | } | 589 | } |
633 | 590 | ||
634 | /* use this if your buffer is less than UTF_SIZ, it returns 1 if you can decode | 591 | size_t |
635 | UTF-8 otherwise return 0 */ | 592 | utf8encode(long u, char *c, size_t clen) { |
636 | int | 593 | size_t len, i; |
637 | isfullutf8(char *s, int b) { | ||
638 | uchar *c1, *c2, *c3; | ||
639 | 594 | ||
640 | c1 = (uchar *)s; | 595 | len = utf8validate(&u, 0); |
641 | c2 = (uchar *)++s; | 596 | if(clen < len) |
642 | c3 = (uchar *)++s; | ||
643 | if(b < 1) { | ||
644 | return 0; | 597 | return 0; |
645 | } else if((*c1 & 0xE0) == 0xC0 && b == 1) { | 598 | for(i = len - 1; i != 0; --i) { |
646 | return 0; | 599 | c[i] = utf8encodebyte(u, 0); |
647 | } else if((*c1 & 0xF0) == 0xE0 && | 600 | u >>= 6; |
648 | ((b == 1) || | ||
649 | ((b == 2) && (*c2 & 0xC0) == 0x80))) { | ||
650 | return 0; | ||
651 | } else if((*c1 & 0xF8) == 0xF0 && | ||
652 | ((b == 1) || | ||
653 | ((b == 2) && (*c2 & 0xC0) == 0x80) || | ||
654 | ((b == 3) && (*c2 & 0xC0) == 0x80 && (*c3 & 0xC0) == 0x80))) { | ||
655 | return 0; | ||
656 | } else { | ||
657 | return 1; | ||
658 | } | 601 | } |
602 | c[0] = utf8encodebyte(u, len); | ||
603 | return len; | ||
659 | } | 604 | } |
660 | 605 | ||
661 | int | 606 | char |
662 | utf8size(char *s) { | 607 | utf8encodebyte(long u, size_t i) { |
663 | uchar c = *s; | 608 | return utfbyte[i] | (u & ~utfmask[i]); |
609 | } | ||
664 | 610 | ||
665 | if(~c & 0x80) { | 611 | size_t |
666 | return 1; | 612 | utf8len(char *c) { |
667 | } else if((c & 0xE0) == 0xC0) { | 613 | return utf8decode(c, &(long){0}, UTF_SIZ); |
668 | return 2; | 614 | } |
669 | } else if((c & 0xF0) == 0xE0) { | 615 | |
670 | return 3; | 616 | size_t |
671 | } else { | 617 | utf8validate(long *u, size_t i) { |
672 | return 4; | 618 | if(!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF)) |
673 | } | 619 | *u = UTF_INVALID; |
620 | for(i = 1; *u > utfmax[i]; ++i) | ||
621 | ; | ||
622 | return i; | ||
674 | } | 623 | } |
675 | 624 | ||
676 | static void | 625 | static void |
@@ -984,7 +933,7 @@ getsel(void) { | |||
984 | if(!selected(x, y) || (gp->mode & ATTR_WDUMMY)) | 933 | if(!selected(x, y) || (gp->mode & ATTR_WDUMMY)) |
985 | continue; | 934 | continue; |
986 | 935 | ||
987 | size = utf8size(gp->c); | 936 | size = utf8len(gp->c); |
988 | memcpy(ptr, gp->c, size); | 937 | memcpy(ptr, gp->c, size); |
989 | ptr += size; | 938 | ptr += size; |
990 | } | 939 | } |
@@ -1298,7 +1247,7 @@ ttyread(void) { | |||
1298 | char *ptr; | 1247 | char *ptr; |
1299 | char s[UTF_SIZ]; | 1248 | char s[UTF_SIZ]; |
1300 | int charsize; /* size of utf8 char in bytes */ | 1249 | int charsize; /* size of utf8 char in bytes */ |
1301 | long utf8c; | 1250 | long unicodep; |
1302 | int ret; | 1251 | int ret; |
1303 | 1252 | ||
1304 | /* append read bytes to unprocessed bytes */ | 1253 | /* append read bytes to unprocessed bytes */ |
@@ -1308,9 +1257,8 @@ ttyread(void) { | |||
1308 | /* process every complete utf8 char */ | 1257 | /* process every complete utf8 char */ |
1309 | buflen += ret; | 1258 | buflen += ret; |
1310 | ptr = buf; | 1259 | ptr = buf; |
1311 | while(buflen >= UTF_SIZ || isfullutf8(ptr,buflen)) { | 1260 | while(charsize = utf8decode(ptr, &unicodep, buflen)) { |
1312 | charsize = utf8decode(ptr, &utf8c); | 1261 | utf8encode(unicodep, s, UTF_SIZ); |
1313 | utf8encode(&utf8c, s); | ||
1314 | tputc(s, charsize); | 1262 | tputc(s, charsize); |
1315 | ptr += charsize; | 1263 | ptr += charsize; |
1316 | buflen -= charsize; | 1264 | buflen -= charsize; |
@@ -2414,14 +2362,14 @@ void | |||
2414 | tputc(char *c, int len) { | 2362 | tputc(char *c, int len) { |
2415 | uchar ascii = *c; | 2363 | uchar ascii = *c; |
2416 | bool control = ascii < '\x20' || ascii == 0177; | 2364 | bool control = ascii < '\x20' || ascii == 0177; |
2417 | long u8char; | 2365 | long unicodep; |
2418 | int width; | 2366 | int width; |
2419 | 2367 | ||
2420 | if(len == 1) { | 2368 | if(len == 1) { |
2421 | width = 1; | 2369 | width = 1; |
2422 | } else { | 2370 | } else { |
2423 | utf8decode(c, &u8char); | 2371 | utf8decode(c, &unicodep, UTF_SIZ); |
2424 | width = wcwidth(u8char); | 2372 | width = wcwidth(unicodep); |
2425 | } | 2373 | } |
2426 | 2374 | ||
2427 | if(IS_SET(MODE_PRINT)) | 2375 | if(IS_SET(MODE_PRINT)) |
@@ -3150,7 +3098,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) { | |||
3150 | int frcflags; | 3098 | int frcflags; |
3151 | int u8fl, u8fblen, u8cblen, doesexist; | 3099 | int u8fl, u8fblen, u8cblen, doesexist; |
3152 | char *u8c, *u8fs; | 3100 | char *u8c, *u8fs; |
3153 | long u8char; | 3101 | long unicodep; |
3154 | Font *font = &dc.font; | 3102 | Font *font = &dc.font; |
3155 | FcResult fcres; | 3103 | FcResult fcres; |
3156 | FcPattern *fcpattern, *fontpattern; | 3104 | FcPattern *fcpattern, *fontpattern; |
@@ -3293,11 +3241,11 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) { | |||
3293 | oneatatime = font->width != xw.cw; | 3241 | oneatatime = font->width != xw.cw; |
3294 | for(;;) { | 3242 | for(;;) { |
3295 | u8c = s; | 3243 | u8c = s; |
3296 | u8cblen = utf8decode(s, &u8char); | 3244 | u8cblen = utf8decode(s, &unicodep, UTF_SIZ); |
3297 | s += u8cblen; | 3245 | s += u8cblen; |
3298 | bytelen -= u8cblen; | 3246 | bytelen -= u8cblen; |
3299 | 3247 | ||
3300 | doesexist = XftCharExists(xw.dpy, font->match, u8char); | 3248 | doesexist = XftCharExists(xw.dpy, font->match, unicodep); |
3301 | if(oneatatime || !doesexist || bytelen <= 0) { | 3249 | if(oneatatime || !doesexist || bytelen <= 0) { |
3302 | if(oneatatime || bytelen <= 0) { | 3250 | if(oneatatime || bytelen <= 0) { |
3303 | if(doesexist) { | 3251 | if(doesexist) { |
@@ -3329,7 +3277,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) { | |||
3329 | 3277 | ||
3330 | /* Search the font cache. */ | 3278 | /* Search the font cache. */ |
3331 | for(i = 0; i < frclen; i++) { | 3279 | for(i = 0; i < frclen; i++) { |
3332 | if(XftCharExists(xw.dpy, frc[i].font, u8char) | 3280 | if(XftCharExists(xw.dpy, frc[i].font, unicodep) |
3333 | && frc[i].flags == frcflags) { | 3281 | && frc[i].flags == frcflags) { |
3334 | break; | 3282 | break; |
3335 | } | 3283 | } |
@@ -3351,7 +3299,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) { | |||
3351 | fcpattern = FcPatternDuplicate(font->pattern); | 3299 | fcpattern = FcPatternDuplicate(font->pattern); |
3352 | fccharset = FcCharSetCreate(); | 3300 | fccharset = FcCharSetCreate(); |
3353 | 3301 | ||
3354 | FcCharSetAddChar(fccharset, u8char); | 3302 | FcCharSetAddChar(fccharset, unicodep); |
3355 | FcPatternAddCharSet(fcpattern, FC_CHARSET, | 3303 | FcPatternAddCharSet(fcpattern, FC_CHARSET, |
3356 | fccharset); | 3304 | fccharset); |
3357 | FcPatternAddBool(fcpattern, FC_SCALABLE, | 3305 | FcPatternAddBool(fcpattern, FC_SCALABLE, |
@@ -3387,7 +3335,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) { | |||
3387 | xp, winy + frc[i].font->ascent, | 3335 | xp, winy + frc[i].font->ascent, |
3388 | (FcChar8 *)u8c, u8cblen); | 3336 | (FcChar8 *)u8c, u8cblen); |
3389 | 3337 | ||
3390 | xp += xw.cw * wcwidth(u8char); | 3338 | xp += xw.cw * wcwidth(unicodep); |
3391 | } | 3339 | } |
3392 | 3340 | ||
3393 | /* | 3341 | /* |
@@ -3430,7 +3378,7 @@ xdrawcursor(void) { | |||
3430 | memcpy(g.c, term.line[term.c.y][term.c.x].c, UTF_SIZ); | 3378 | memcpy(g.c, term.line[term.c.y][term.c.x].c, UTF_SIZ); |
3431 | 3379 | ||
3432 | /* remove the old cursor */ | 3380 | /* remove the old cursor */ |
3433 | sl = utf8size(term.line[oldy][oldx].c); | 3381 | sl = utf8len(term.line[oldy][oldx].c); |
3434 | width = (term.line[oldy][oldx].mode & ATTR_WIDE)? 2 : 1; | 3382 | width = (term.line[oldy][oldx].mode & ATTR_WIDE)? 2 : 1; |
3435 | xdraws(term.line[oldy][oldx].c, term.line[oldy][oldx], oldx, | 3383 | xdraws(term.line[oldy][oldx].c, term.line[oldy][oldx], oldx, |
3436 | oldy, width, sl); | 3384 | oldy, width, sl); |
@@ -3444,7 +3392,7 @@ xdrawcursor(void) { | |||
3444 | g.bg = defaultfg; | 3392 | g.bg = defaultfg; |
3445 | } | 3393 | } |
3446 | 3394 | ||
3447 | sl = utf8size(g.c); | 3395 | sl = utf8len(g.c); |
3448 | width = (term.line[term.c.y][curx].mode & ATTR_WIDE)\ | 3396 | width = (term.line[term.c.y][curx].mode & ATTR_WIDE)\ |
3449 | ? 2 : 1; | 3397 | ? 2 : 1; |
3450 | xdraws(g.c, g, term.c.x, term.c.y, width, sl); | 3398 | xdraws(g.c, g, term.c.x, term.c.y, width, sl); |
@@ -3516,7 +3464,7 @@ drawregion(int x1, int y1, int x2, int y2) { | |||
3516 | Glyph base, new; | 3464 | Glyph base, new; |
3517 | char buf[DRAW_BUF_SIZ]; | 3465 | char buf[DRAW_BUF_SIZ]; |
3518 | bool ena_sel = sel.ob.x != -1; | 3466 | bool ena_sel = sel.ob.x != -1; |
3519 | long u8char; | 3467 | long unicodep; |
3520 | 3468 | ||
3521 | if(sel.alt ^ IS_SET(MODE_ALTSCREEN)) | 3469 | if(sel.alt ^ IS_SET(MODE_ALTSCREEN)) |
3522 | ena_sel = 0; | 3470 | ena_sel = 0; |
@@ -3548,7 +3496,7 @@ drawregion(int x1, int y1, int x2, int y2) { | |||
3548 | base = new; | 3496 | base = new; |
3549 | } | 3497 | } |
3550 | 3498 | ||
3551 | sl = utf8decode(new.c, &u8char); | 3499 | sl = utf8decode(new.c, &unicodep, UTF_SIZ); |
3552 | memcpy(buf+ib, new.c, sl); | 3500 | memcpy(buf+ib, new.c, sl); |
3553 | ib += sl; | 3501 | ib += sl; |
3554 | ic += (new.mode & ATTR_WIDE)? 2 : 1; | 3502 | ic += (new.mode & ATTR_WIDE)? 2 : 1; |
@@ -3707,7 +3655,7 @@ kpress(XEvent *ev) { | |||
3707 | if(IS_SET(MODE_8BIT)) { | 3655 | if(IS_SET(MODE_8BIT)) { |
3708 | if(*buf < 0177) { | 3656 | if(*buf < 0177) { |
3709 | c = *buf | 0x80; | 3657 | c = *buf | 0x80; |
3710 | len = utf8encode(&c, buf); | 3658 | len = utf8encode(c, buf, UTF_SIZ); |
3711 | } | 3659 | } |
3712 | } else { | 3660 | } else { |
3713 | buf[1] = buf[0]; | 3661 | buf[1] = buf[0]; |