YAP 7.1.0
getw.h
1
/* True when `ch` has the bit pattern 10xxxxxx in its low byte, i.e. it is a
 * UTF-8 continuation byte. */
#define utf_cont(ch) ((((ch) ^ 0x80) & 0xC0) == 0)
3
4
5static int post_process_f_weof(StreamDesc *st)
6{
7 if (ferror(st->file)) {
8 clearerr(st->file);
9 return 1;
10 } else {
11 return post_process_weof(st);
12 }
13
14}
15
19extern int get_wchar(int sno) {
20 StreamDesc *st = GLOBAL_Stream + sno;
21 int ch = st->stream_getc(sno);
22
23 if (ch == -1)
24 return post_process_f_weof(st);
25
26 switch (st->encoding) {
27 case ENC_OCTET:
28 // no error detection, all characters are ok.
29 case ENC_ISO_LATIN1:
30 return post_process_read_wchar(ch, 1, st);
31 // 7 bits code, anything above is bad news
32 case ENC_ISO_ASCII:
33 if (ch & 0x80) {
34 /* error */
35 }
36 return post_process_read_wchar(ch, 1, st);
37 // default OS encoding, depends on locale.
38 case ENC_ISO_ANSI: {
39 char buf[8];
40 int out;
41 wchar_t wch;
42 mbstate_t mbstate;
43
44 memset((void *)&(mbstate), 0, sizeof(mbstate_t));
45 buf[0] = ch;
46 int n = 1;
47 while ((out = mbrtowc(&wch, buf, 1, &(mbstate))) != 1) {
48 int ch = buf[0] = st->stream_getc(sno);
49 n++;
50 if (ch == -1)
51 return post_process_weof(st);
52 }
53 return post_process_read_wchar(wch, n, st);
54 }
55 // UTF-8 works o 8 bits.
56 case ENC_ISO_UTF8: {
57 int wch;
58 unsigned char buf[8];
59
60 if (ch < 0x80) {
61 return post_process_read_wchar(ch, 1, st);
62 }
63 if ((ch - 0xc2) > (0xf4-0xc2)) {
64 return Yap_encoding_error(ch, 1, st);
65 }
66 if (ch < 0xe0) { // 2-byte sequence
67 // Must have valid continuation character
68 int c1 = buf[0] = st->stream_getc(sno);
69 if (c1 == -1)
70 return post_process_weof(st);
71 if (!utf_cont(c1)) {
72 return Yap_encoding_error(ch, 2, st);
73 }
74 wch = ((ch & 0x1f) << 6) | (c1 & 0x3f);
75 return post_process_read_wchar(wch, 2, st);
76 }
77 if (ch < 0xf0) { // 3-byte sequence
78 int c1 = st->stream_getc(sno);
79 if (c1 == -1)
80 return post_process_weof(st);
81 // return UTF8PROC_ERROR_INVALIDUTF8;
82 if (ch == 0xed && c1 > 0x9f) {
83 return Yap_encoding_error(ch, 1, st);
84 }
85 int c2 = st->stream_getc(sno);
86 if (c2 == -1)
87 return post_process_weof(st);
88 if ( !utf_cont(c1) || !utf_cont(c2)) {
89 return Yap_encoding_error(ch, 2, st);
90 // Check for surrogate chars
91
92 }
93 wch = ((ch & 0xf) << 12) | ((c1 & 0x3f) << 6) | (c2 & 0x3f);
94 return post_process_read_wchar(wch, 3, st);
95 } else {
96 int c1 = st->stream_getc(sno);
97 if (c1 == -1)
98 return post_process_weof(st);
99 int c2 = st->stream_getc(sno);
100 if (c2 == -1)
101 return post_process_weof(st);
102 int c3 = st->stream_getc(sno);
103 if (c3 == -1)
104 return post_process_weof(st);
105 if ( !utf_cont(c1) || !utf_cont(c2) || !utf_cont(c3)) {
106 return Yap_encoding_error(ch, 3, st);
107 }
108 wch = ((ch & 7) << 18) | ((c1 & 0x3f) << 12) | ((c2 & 0x3f) << 6) |
109 (c3 & 0x3f);
110 return post_process_read_wchar(wch, 4, st);
111 }
112 }
113 case ENC_UTF16_BE: // check http://unicode.org/faq/utf_bom.html#utf16-3
114 // big-endian: most significant octet first
115 {
116 unsigned int wch;
117 int c1 = st->stream_getc(sno);
118 if (c1 == -1)
119 return post_process_weof(st);
120 wch = (unsigned int)(c1 << 8) + ch;
121 if (wch >= 0xd800 && wch < 0xdc00) {
122 int c2 = st->stream_getc(sno);
123 if (c2 == -1)
124 return post_process_weof(st);
125 int c3 = st->stream_getc(sno);
126 if (c3 == -1)
127 return post_process_weof(st);
128 wch = wch + ((unsigned int)((unsigned int)((c3 << 8) + c2) << 8) + SURROGATE_OFFSET);
129 return post_process_read_wchar(wch, 4, st);
130 }
131 return post_process_read_wchar(wch, 2, st);
132 }
133
134 case ENC_UTF16_LE: // check http://unicode.org/faq/utf_bom.html#utf16-3
135 // little-endian: least significant octet first
136 {
137 unsigned int wch;
138 int c1 = st->stream_getc(sno);
139 if (c1 == -1)
140 return post_process_weof(st);
141 wch = (c1) + (ch << 8);
142
143
144 // printf("%d %c %d %d \n", wch, wch, ch, c1);
145 if (wch >= 0xd800 && wch < 0xdc00) {
146 int c3 = st->stream_getc(sno);
147 if (c3 == -1)
148 return post_process_weof(st);
149 int c2 = st->stream_getc(sno);
150 if (c2 == -1)
151 return post_process_weof(st);
152 wch = (((c3 << 8) + c2) << 8) + wch + SURROGATE_OFFSET;
153 return post_process_read_wchar(wch, 4, st);
154 }
155 return post_process_read_wchar(wch, 2, st);
156 }
157
158 case ENC_UCS2_LE: // check http://unicode.org/faq/utf_bom.html#utf16-3
159 // big-endian: most significant byte first
160 {
161 unsigned int wch;
162 int c1 = st->stream_getc(sno);
163 if (c1 == -1)
164 return post_process_weof(st);
165 wch = (c1) + (ch << 8);
166 return post_process_read_wchar(wch, 2, st);
167 }
168
169 case ENC_UCS2_BE: // check http://unicode.org/faq/utf_bom.html#utf16-3
170 // little-endian: least significant byte first
171 {
172 unsigned int wch;
173 int c1 = st->stream_getc(sno);
174 if (c1 == -1)
175 return post_process_weof(st);
176 wch = (c1 << 8) + ch;
177
178 return post_process_read_wchar(wch, 2, st);
179 }
180
181 case ENC_ISO_UTF32_BE: // check http://unicode.org/faq/utf_bom.html#utf16-3
182 // big-endian: from most to least significant
183 {
184 unsigned int wch = ch;
185 {
186 int c1 = st->stream_getc(sno);
187 if (c1 == -1)
188 return post_process_weof(st);
189 wch = wch + (unsigned int)c1;
190 }
191 {
192 int c1 = st->stream_getc(sno);
193 if (c1 == -1)
194 return post_process_weof(st);
195 wch = (wch << 8) + (unsigned int)c1;
196 }
197 {
198 int c1 = st->stream_getc(sno);
199 if (c1 == -1)
200 return post_process_weof(st);
201 wch = (wch << 8) + (unsigned int)c1;
202 }
203 return post_process_read_wchar(wch, 4, st);
204 }
205 case ENC_ISO_UTF32_LE: // check http://unicode.org/faq/utf_bom.html#utf16-3
206 // little-endian: from least to most significant
207 {
208 unsigned int wch = ch;
209 {
210 int c1 = st->stream_getc(sno);
211 if (c1 == -1)
212 return post_process_weof(st);
213 wch += (unsigned int)(c1 << 8);
214 }
215 {
216 int c1 = st->stream_getc(sno);
217 if (c1 == -1)
218 return post_process_weof(st);
219 wch += (unsigned int)(c1 << 16);
220 }
221 {
222 int c1 = st->stream_getc(sno);
223 if (c1 == -1)
224 return post_process_weof(st);
225 wch += (unsigned int)(c1 << 24);
226 }
227 return post_process_read_wchar(wch, 4, st);
228 }
229 default:
230 Yap_ThrowError(SYSTEM_ERROR_OPERATING_SYSTEM, MkIntTerm(st->encoding),
231 "Unsupported Encoding %d\n", st->encoding);
232 return -1;
233 }
234}
235
236extern int get_wchar_UTF8(int sno) {
237 StreamDesc *st = GLOBAL_Stream + sno;
238 int ch = st->stream_getc(sno);
239 if (ch == -1)
240 return post_process_weof(st);
241 else {
242 int wch;
243 unsigned char buf[8];
244
245 if (ch < 0x80) {
246 return post_process_read_wchar(ch, 1, st);
247 }
248 // if ((ch - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
249 if (ch < 0xe0) { // 2-byte sequence
250 // Must have valid continuation character
251 int c1 = buf[0] = st->stream_getc(sno);
252 if (c1 == -1)
253 return post_process_weof(st);
254 if (!utf_cont(c1)) {
255 return Yap_encoding_error(ch, 2, st);
256 }
257 wch = ((ch & 0x1f) << 6) | (c1 & 0x3f);
258 return post_process_read_wchar(wch, 2, st);
259 }
260 if (ch < 0xf0) { // 3-byte sequence
261 // if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
262 // return UTF8PROC_ERROR_INVALIDUTF8;
263 // Check for surrogate chars
264 // if (ch == 0xed && *str > 0x9f)
265 // return UTF8PROC_ERROR_INVALIDUTF8;
266 int c1 = st->stream_getc(sno);
267 if (c1 == -1)
268 return post_process_weof(st);
269 if (ch == 0xed && c1 > 0x9f)
270 return Yap_encoding_error(ch, 2, st);
271 int c2 = st->stream_getc(sno);
272 if (c2 == -1)
273 return post_process_weof(st);
274 wch = ((ch & 0xf)<<12) | ((c1 & 0x3f)<<6) | (c2 & 0x3f);
275 if (wch < 0x800)
276 return Yap_encoding_error(ch, 3, st);
277 return post_process_read_wchar(wch, 3, st);
278 } else {
279 int c1 = st->stream_getc(sno);
280 if (c1 == -1)
281 return post_process_weof(st);
282 int c2 = st->stream_getc(sno);
283 if (c2 == -1)
284 return post_process_weof(st);
285 int c3 = st->stream_getc(sno);
286 if (c3 == -1)
287 return post_process_weof(st);
288 if (ch == 0xf0) {
289 if (c1 < 0x90) return Yap_encoding_error(ch, 4, st);
290 } else if (c1 == 0xf4) {
291 if (c2 > 0x8f) return Yap_encoding_error(ch, 4, st);
292 }
293 wch = ((ch & 7)<<18) | ((c1 & 0x3f)<<12) | ((c2 & 0x3f)<<6) | (c3 & 0x3f);
294 return post_process_read_wchar(wch, 4, st);
295 }
296 }
297}
int(* stream_getc)(int)
function the stream uses for reading a character
Definition: YapStreams.h:256
encoding_t encoding
character encoding used to decode the stream's input
Definition: YapStreams.h:265