aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Damian Okrasa <dokrasa@gmail.com> 2014-03-25 20:20:26 +0100
committer: Roberto E. Vargas Caballero <k0ga@shike2.com> 2014-03-27 07:19:37 +0100
commit: 45b808b88ee63f21a188800ba3473a24a3c4b987 (patch)
tree: e4f5285e5b27a438c770aa4d794823e027cccac3
parent: 71328cbcdc88f4fdfbb62d8c0324938e245c8971 (diff)
download: st-patched-45b808b88ee63f21a188800ba3473a24a3c4b987.tar.bz2
st-patched-45b808b88ee63f21a188800ba3473a24a3c4b987.tar.xz
st-patched-45b808b88ee63f21a188800ba3473a24a3c4b987.zip
new utf decoder
This patch replaces the current UTF-8 decoder with a new one, which is ~50 lines shorter and should be easier to understand. Parsing 5- and 6-byte sequences, if necessary, requires only a trivial modification of the UTF_SIZ constant and the utfbyte, utfmask, utfmin, and utfmax arrays.
-rw-r--r--st.c214
1 files changed, 81 insertions, 133 deletions
diff --git a/st.c b/st.c
index 69b2491..e20a1e0 100644
--- a/st.c
+++ b/st.c
@@ -55,6 +55,7 @@ char *argv0;
55#define XEMBED_FOCUS_OUT 5 55#define XEMBED_FOCUS_OUT 5
56 56
57/* Arbitrary sizes */ 57/* Arbitrary sizes */
58#define UTF_INVALID 0xFFFD
58#define UTF_SIZ 4 59#define UTF_SIZ 4
59#define ESC_BUF_SIZ (128*UTF_SIZ) 60#define ESC_BUF_SIZ (128*UTF_SIZ)
60#define ESC_ARG_SIZ 16 61#define ESC_ARG_SIZ 16
@@ -442,10 +443,12 @@ static void selcopy(void);
442static void selscroll(int, int); 443static void selscroll(int, int);
443static void selsnap(int, int *, int *, int); 444static void selsnap(int, int *, int *, int);
444 445
445static int utf8decode(char *, long *); 446static size_t utf8decode(char *, long *, size_t);
446static int utf8encode(long *, char *); 447static long utf8decodebyte(char, size_t *);
447static int utf8size(char *); 448static size_t utf8encode(long, char *, size_t);
448static int isfullutf8(char *, int); 449static char utf8encodebyte(long, size_t);
450static size_t utf8len(char *);
451static size_t utf8validate(long *, size_t);
449 452
450static ssize_t xwrite(int, char *, size_t); 453static ssize_t xwrite(int, char *, size_t);
451static void *xmalloc(size_t); 454static void *xmalloc(size_t);
@@ -490,6 +493,11 @@ static int oldbutton = 3; /* button event on startup: 3 = release */
490static char *usedfont = NULL; 493static char *usedfont = NULL;
491static double usedfontsize = 0; 494static double usedfontsize = 0;
492 495
496static uchar utfbyte[UTF_SIZ + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
497static uchar utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
498static long utfmin[UTF_SIZ + 1] = { 0, 0, 0x80, 0x800, 0x10000};
499static long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
500
493/* Font Ring Cache */ 501/* Font Ring Cache */
494enum { 502enum {
495 FRC_NORMAL, 503 FRC_NORMAL,
@@ -549,128 +557,69 @@ xstrdup(char *s) {
549 return p; 557 return p;
550} 558}
551 559
552int 560size_t
553utf8decode(char *s, long *u) { 561utf8decode(char *c, long *u, size_t clen) {
554 uchar c; 562 size_t i, j, len, type;
555 int i, n, rtn; 563 long udecoded;
556
557 rtn = 1;
558 c = *s;
559 if(~c & 0x80) { /* 0xxxxxxx */
560 *u = c;
561 return rtn;
562 } else if((c & 0xE0) == 0xC0) { /* 110xxxxx */
563 *u = c & 0x1F;
564 n = 1;
565 } else if((c & 0xF0) == 0xE0) { /* 1110xxxx */
566 *u = c & 0x0F;
567 n = 2;
568 } else if((c & 0xF8) == 0xF0) { /* 11110xxx */
569 *u = c & 0x07;
570 n = 3;
571 } else {
572 goto invalid;
573 }
574
575 for(i = n, ++s; i > 0; --i, ++rtn, ++s) {
576 c = *s;
577 if((c & 0xC0) != 0x80) /* 10xxxxxx */
578 goto invalid;
579 *u <<= 6;
580 *u |= c & 0x3F;
581 }
582
583 if((n == 1 && *u < 0x80) ||
584 (n == 2 && *u < 0x800) ||
585 (n == 3 && *u < 0x10000) ||
586 (*u >= 0xD800 && *u <= 0xDFFF)) {
587 goto invalid;
588 }
589
590 return rtn;
591invalid:
592 *u = 0xFFFD;
593
594 return rtn;
595}
596 564
597int 565 *u = UTF_INVALID;
598utf8encode(long *u, char *s) { 566 if(!clen)
599 uchar *sp; 567 return 0;
600 ulong uc; 568 udecoded = utf8decodebyte(c[0], &len);
601 int i, n; 569 if(!BETWEEN(len, 1, UTF_SIZ))
602
603 sp = (uchar *)s;
604 uc = *u;
605 if(uc < 0x80) {
606 *sp = uc; /* 0xxxxxxx */
607 return 1; 570 return 1;
608 } else if(*u < 0x800) { 571 for(i = 1, j = 1; i < clen && j < len; ++i, ++j) {
609 *sp = (uc >> 6) | 0xC0; /* 110xxxxx */ 572 udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
610 n = 1; 573 if(type != 0)
611 } else if(uc < 0x10000) { 574 return j;
612 *sp = (uc >> 12) | 0xE0; /* 1110xxxx */
613 n = 2;
614 } else if(uc <= 0x10FFFF) {
615 *sp = (uc >> 18) | 0xF0; /* 11110xxx */
616 n = 3;
617 } else {
618 goto invalid;
619 } 575 }
576 if(j < len)
577 return 0;
578 *u = udecoded;
579 utf8validate(u, len);
580 return len;
581}
620 582
621 for(i=n,++sp; i>0; --i,++sp) 583long
622 *sp = ((uc >> 6*(i-1)) & 0x3F) | 0x80; /* 10xxxxxx */ 584utf8decodebyte(char c, size_t *i) {
623 585 for(*i = 0; *i < LEN(utfmask); ++(*i))
624 return n+1; 586 if(((uchar)c & utfmask[*i]) == utfbyte[*i])
625invalid: 587 return (uchar)c & ~utfmask[*i];
626 /* U+FFFD */ 588 return 0;
627 *s++ = '\xEF';
628 *s++ = '\xBF';
629 *s = '\xBD';
630
631 return 3;
632} 589}
633 590
634/* use this if your buffer is less than UTF_SIZ, it returns 1 if you can decode 591size_t
635 UTF-8 otherwise return 0 */ 592utf8encode(long u, char *c, size_t clen) {
636int 593 size_t len, i;
637isfullutf8(char *s, int b) {
638 uchar *c1, *c2, *c3;
639 594
640 c1 = (uchar *)s; 595 len = utf8validate(&u, 0);
641 c2 = (uchar *)++s; 596 if(clen < len)
642 c3 = (uchar *)++s;
643 if(b < 1) {
644 return 0; 597 return 0;
645 } else if((*c1 & 0xE0) == 0xC0 && b == 1) { 598 for(i = len - 1; i != 0; --i) {
646 return 0; 599 c[i] = utf8encodebyte(u, 0);
647 } else if((*c1 & 0xF0) == 0xE0 && 600 u >>= 6;
648 ((b == 1) ||
649 ((b == 2) && (*c2 & 0xC0) == 0x80))) {
650 return 0;
651 } else if((*c1 & 0xF8) == 0xF0 &&
652 ((b == 1) ||
653 ((b == 2) && (*c2 & 0xC0) == 0x80) ||
654 ((b == 3) && (*c2 & 0xC0) == 0x80 && (*c3 & 0xC0) == 0x80))) {
655 return 0;
656 } else {
657 return 1;
658 } 601 }
602 c[0] = utf8encodebyte(u, len);
603 return len;
659} 604}
660 605
661int 606char
662utf8size(char *s) { 607utf8encodebyte(long u, size_t i) {
663 uchar c = *s; 608 return utfbyte[i] | (u & ~utfmask[i]);
609}
664 610
665 if(~c & 0x80) { 611size_t
666 return 1; 612utf8len(char *c) {
667 } else if((c & 0xE0) == 0xC0) { 613 return utf8decode(c, &(long){0}, UTF_SIZ);
668 return 2; 614}
669 } else if((c & 0xF0) == 0xE0) { 615
670 return 3; 616size_t
671 } else { 617utf8validate(long *u, size_t i) {
672 return 4; 618 if(!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF))
673 } 619 *u = UTF_INVALID;
620 for(i = 1; *u > utfmax[i]; ++i)
621 ;
622 return i;
674} 623}
675 624
676static void 625static void
@@ -984,7 +933,7 @@ getsel(void) {
984 if(!selected(x, y) || (gp->mode & ATTR_WDUMMY)) 933 if(!selected(x, y) || (gp->mode & ATTR_WDUMMY))
985 continue; 934 continue;
986 935
987 size = utf8size(gp->c); 936 size = utf8len(gp->c);
988 memcpy(ptr, gp->c, size); 937 memcpy(ptr, gp->c, size);
989 ptr += size; 938 ptr += size;
990 } 939 }
@@ -1298,7 +1247,7 @@ ttyread(void) {
1298 char *ptr; 1247 char *ptr;
1299 char s[UTF_SIZ]; 1248 char s[UTF_SIZ];
1300 int charsize; /* size of utf8 char in bytes */ 1249 int charsize; /* size of utf8 char in bytes */
1301 long utf8c; 1250 long unicodep;
1302 int ret; 1251 int ret;
1303 1252
1304 /* append read bytes to unprocessed bytes */ 1253 /* append read bytes to unprocessed bytes */
@@ -1308,9 +1257,8 @@ ttyread(void) {
1308 /* process every complete utf8 char */ 1257 /* process every complete utf8 char */
1309 buflen += ret; 1258 buflen += ret;
1310 ptr = buf; 1259 ptr = buf;
1311 while(buflen >= UTF_SIZ || isfullutf8(ptr,buflen)) { 1260 while(charsize = utf8decode(ptr, &unicodep, buflen)) {
1312 charsize = utf8decode(ptr, &utf8c); 1261 utf8encode(unicodep, s, UTF_SIZ);
1313 utf8encode(&utf8c, s);
1314 tputc(s, charsize); 1262 tputc(s, charsize);
1315 ptr += charsize; 1263 ptr += charsize;
1316 buflen -= charsize; 1264 buflen -= charsize;
@@ -2414,14 +2362,14 @@ void
2414tputc(char *c, int len) { 2362tputc(char *c, int len) {
2415 uchar ascii = *c; 2363 uchar ascii = *c;
2416 bool control = ascii < '\x20' || ascii == 0177; 2364 bool control = ascii < '\x20' || ascii == 0177;
2417 long u8char; 2365 long unicodep;
2418 int width; 2366 int width;
2419 2367
2420 if(len == 1) { 2368 if(len == 1) {
2421 width = 1; 2369 width = 1;
2422 } else { 2370 } else {
2423 utf8decode(c, &u8char); 2371 utf8decode(c, &unicodep, UTF_SIZ);
2424 width = wcwidth(u8char); 2372 width = wcwidth(unicodep);
2425 } 2373 }
2426 2374
2427 if(IS_SET(MODE_PRINT)) 2375 if(IS_SET(MODE_PRINT))
@@ -3150,7 +3098,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) {
3150 int frcflags; 3098 int frcflags;
3151 int u8fl, u8fblen, u8cblen, doesexist; 3099 int u8fl, u8fblen, u8cblen, doesexist;
3152 char *u8c, *u8fs; 3100 char *u8c, *u8fs;
3153 long u8char; 3101 long unicodep;
3154 Font *font = &dc.font; 3102 Font *font = &dc.font;
3155 FcResult fcres; 3103 FcResult fcres;
3156 FcPattern *fcpattern, *fontpattern; 3104 FcPattern *fcpattern, *fontpattern;
@@ -3293,11 +3241,11 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) {
3293 oneatatime = font->width != xw.cw; 3241 oneatatime = font->width != xw.cw;
3294 for(;;) { 3242 for(;;) {
3295 u8c = s; 3243 u8c = s;
3296 u8cblen = utf8decode(s, &u8char); 3244 u8cblen = utf8decode(s, &unicodep, UTF_SIZ);
3297 s += u8cblen; 3245 s += u8cblen;
3298 bytelen -= u8cblen; 3246 bytelen -= u8cblen;
3299 3247
3300 doesexist = XftCharExists(xw.dpy, font->match, u8char); 3248 doesexist = XftCharExists(xw.dpy, font->match, unicodep);
3301 if(oneatatime || !doesexist || bytelen <= 0) { 3249 if(oneatatime || !doesexist || bytelen <= 0) {
3302 if(oneatatime || bytelen <= 0) { 3250 if(oneatatime || bytelen <= 0) {
3303 if(doesexist) { 3251 if(doesexist) {
@@ -3329,7 +3277,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) {
3329 3277
3330 /* Search the font cache. */ 3278 /* Search the font cache. */
3331 for(i = 0; i < frclen; i++) { 3279 for(i = 0; i < frclen; i++) {
3332 if(XftCharExists(xw.dpy, frc[i].font, u8char) 3280 if(XftCharExists(xw.dpy, frc[i].font, unicodep)
3333 && frc[i].flags == frcflags) { 3281 && frc[i].flags == frcflags) {
3334 break; 3282 break;
3335 } 3283 }
@@ -3351,7 +3299,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) {
3351 fcpattern = FcPatternDuplicate(font->pattern); 3299 fcpattern = FcPatternDuplicate(font->pattern);
3352 fccharset = FcCharSetCreate(); 3300 fccharset = FcCharSetCreate();
3353 3301
3354 FcCharSetAddChar(fccharset, u8char); 3302 FcCharSetAddChar(fccharset, unicodep);
3355 FcPatternAddCharSet(fcpattern, FC_CHARSET, 3303 FcPatternAddCharSet(fcpattern, FC_CHARSET,
3356 fccharset); 3304 fccharset);
3357 FcPatternAddBool(fcpattern, FC_SCALABLE, 3305 FcPatternAddBool(fcpattern, FC_SCALABLE,
@@ -3387,7 +3335,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) {
3387 xp, winy + frc[i].font->ascent, 3335 xp, winy + frc[i].font->ascent,
3388 (FcChar8 *)u8c, u8cblen); 3336 (FcChar8 *)u8c, u8cblen);
3389 3337
3390 xp += xw.cw * wcwidth(u8char); 3338 xp += xw.cw * wcwidth(unicodep);
3391 } 3339 }
3392 3340
3393 /* 3341 /*
@@ -3430,7 +3378,7 @@ xdrawcursor(void) {
3430 memcpy(g.c, term.line[term.c.y][term.c.x].c, UTF_SIZ); 3378 memcpy(g.c, term.line[term.c.y][term.c.x].c, UTF_SIZ);
3431 3379
3432 /* remove the old cursor */ 3380 /* remove the old cursor */
3433 sl = utf8size(term.line[oldy][oldx].c); 3381 sl = utf8len(term.line[oldy][oldx].c);
3434 width = (term.line[oldy][oldx].mode & ATTR_WIDE)? 2 : 1; 3382 width = (term.line[oldy][oldx].mode & ATTR_WIDE)? 2 : 1;
3435 xdraws(term.line[oldy][oldx].c, term.line[oldy][oldx], oldx, 3383 xdraws(term.line[oldy][oldx].c, term.line[oldy][oldx], oldx,
3436 oldy, width, sl); 3384 oldy, width, sl);
@@ -3444,7 +3392,7 @@ xdrawcursor(void) {
3444 g.bg = defaultfg; 3392 g.bg = defaultfg;
3445 } 3393 }
3446 3394
3447 sl = utf8size(g.c); 3395 sl = utf8len(g.c);
3448 width = (term.line[term.c.y][curx].mode & ATTR_WIDE)\ 3396 width = (term.line[term.c.y][curx].mode & ATTR_WIDE)\
3449 ? 2 : 1; 3397 ? 2 : 1;
3450 xdraws(g.c, g, term.c.x, term.c.y, width, sl); 3398 xdraws(g.c, g, term.c.x, term.c.y, width, sl);
@@ -3516,7 +3464,7 @@ drawregion(int x1, int y1, int x2, int y2) {
3516 Glyph base, new; 3464 Glyph base, new;
3517 char buf[DRAW_BUF_SIZ]; 3465 char buf[DRAW_BUF_SIZ];
3518 bool ena_sel = sel.ob.x != -1; 3466 bool ena_sel = sel.ob.x != -1;
3519 long u8char; 3467 long unicodep;
3520 3468
3521 if(sel.alt ^ IS_SET(MODE_ALTSCREEN)) 3469 if(sel.alt ^ IS_SET(MODE_ALTSCREEN))
3522 ena_sel = 0; 3470 ena_sel = 0;
@@ -3548,7 +3496,7 @@ drawregion(int x1, int y1, int x2, int y2) {
3548 base = new; 3496 base = new;
3549 } 3497 }
3550 3498
3551 sl = utf8decode(new.c, &u8char); 3499 sl = utf8decode(new.c, &unicodep, UTF_SIZ);
3552 memcpy(buf+ib, new.c, sl); 3500 memcpy(buf+ib, new.c, sl);
3553 ib += sl; 3501 ib += sl;
3554 ic += (new.mode & ATTR_WIDE)? 2 : 1; 3502 ic += (new.mode & ATTR_WIDE)? 2 : 1;
@@ -3707,7 +3655,7 @@ kpress(XEvent *ev) {
3707 if(IS_SET(MODE_8BIT)) { 3655 if(IS_SET(MODE_8BIT)) {
3708 if(*buf < 0177) { 3656 if(*buf < 0177) {
3709 c = *buf | 0x80; 3657 c = *buf | 0x80;
3710 len = utf8encode(&c, buf); 3658 len = utf8encode(c, buf, UTF_SIZ);
3711 } 3659 }
3712 } else { 3660 } else {
3713 buf[1] = buf[0]; 3661 buf[1] = buf[0];