[go: up one dir, main page]

File: unicode.c

package info (click to toggle)
console-tools 1998.08.11-3
  • links: PTS
  • area: main
  • in suites: slink
  • size: 5,240 kB
  • ctags: 805
  • sloc: ansic: 8,241; sh: 2,947; yacc: 970; makefile: 356; lex: 287; pascal: 192; perl: 110
file content (103 lines) | stat: -rw-r--r-- 2,045 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#include <lct/unicode.h>

/*
 * Encode the UCS2 character `c' as UTF-8 into `utf' and terminate the
 * result with a zero byte.
 *
 *   c <  0x80   ->  1 byte   0xxxxxxx
 *   c <  0x800  ->  2 bytes  110xxxxx 10xxxxxx
 *   otherwise   ->  3 bytes  1110xxxx 10xxxxxx 10xxxxxx
 *
 * `utf' must have room for up to 4 bytes (3 encoded bytes plus the
 * terminator).  Nothing is returned.
 *
 * UTF-8 is defined for words of up to 31 bits, but only the forms
 * needed for 16-bit values are produced here.
 *
 * Adapted from Linux kernel 2.0.30
 */
void ucs2_to_utf8(unicode c, char* utf)
{
  char *out = utf;	/* next byte to write */

  if (c < 0x80)
    {
      /* plain 7-bit value: stored unchanged */
      *out++ = c;
    }
  else
    {
      if (c < 0x800)
	{
	  /* lead byte carrying the bits above the low six */
	  *out++ = 0xc0 | (c >> 6);
	}
      else
	{
	  /* lead byte with the bits above the low twelve, then the middle six bits */
	  *out++ = 0xe0 | (c >> 12);
	  *out++ = 0x80 | ((c >> 6) & 0x3f);
	}

      /* both multi-byte forms end with the low six bits */
      *out++ = 0x80 | (c & 0x3f);
    }

  *out = 0;
}


/*
 * Decode one character from the UTF-8 byte string `buf' and return it.
 *
 * Bytes are read from `buf' one at a time until one of these happens:
 *
 *   - a byte with the high bit clear is read: it is returned as is.
 *     Any multi-byte sequence that was in progress is dropped, so
 *     incomplete characters are silently ignored;
 *   - the last continuation byte (10xxxxxx) of a multi-byte sequence
 *     is read: the assembled value is returned;
 *   - a byte is read that can neither continue the sequence in
 *     progress nor start a new one (a stray 10xxxxxx byte, 0xfe or
 *     0xff): decoding stops at that byte and UTF8_INVALID_CHAR
 *     (U+FFFD, the Unicode replacement character) is returned.
 *     Earlier versions returned an uninitialized value in this case.
 *
 * A lead byte met in the middle of a sequence discards that sequence
 * and starts a new one.
 *
 * Lead bytes announcing up to five continuation bytes are accepted.
 * The assembled value is converted to `unicode' on return, so it is
 * truncated if it does not fit.  NOTE(review): `unicode' comes from
 * <lct/unicode.h> and is presumably 16 bits wide -- confirm.
 * Overlong encodings are not rejected.
 *
 * The caller gets no indication of how many bytes were consumed.
 */
unicode utf8_to_ucs2 (char* buf)
{
  enum { UTF8_INVALID_CHAR = 0xfffd };

  int utf_count = 0;		/* continuation bytes still expected */
  unsigned long utf_char = 0;	/* value assembled so far */
  unsigned char c;

  for (;;)
    {
      c = *buf;
      buf++;

      /* high bit clear: not part of a multi-byte sequence, return it
       * as is; an unfinished sequence is thereby ignored */
      if (!(c & 0x80))
	return c;

      /* continuation byte of a sequence already in progress */
      if (utf_count > 0 && (c & 0xc0) == 0x80)
	{
	  utf_char = (utf_char << 6) | (c & 0x3f);
	  utf_count--;
	  if (utf_count == 0)
	    return utf_char;
	  continue;
	}

      /* possibly the first byte of a (new) sequence: the number of
       * leading one bits gives the sequence length */
      if ((c & 0xe0) == 0xc0)
	{
	  utf_count = 1;
	  utf_char = (c & 0x1f);
	}
      else if ((c & 0xf0) == 0xe0)
	{
	  utf_count = 2;
	  utf_char = (c & 0x0f);
	}
      else if ((c & 0xf8) == 0xf0)
	{
	  utf_count = 3;
	  utf_char = (c & 0x07);
	}
      else if ((c & 0xfc) == 0xf8)
	{
	  utf_count = 4;
	  utf_char = (c & 0x03);
	}
      else if ((c & 0xfe) == 0xfc)
	{
	  utf_count = 5;
	  utf_char = (c & 0x01);
	}
      else
	{
	  /* neither a valid continuation nor a lead byte: stop here
	   * with a defined result instead of an indeterminate one */
	  return UTF8_INVALID_CHAR;
	}
    }
}