-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_decode_utf8.c
149 lines (137 loc) · 6.19 KB
/
test_decode_utf8.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#include <stdio.h>
#include <stdlib.h> // needed for exit()
#include <stdbool.h> // needed for is1butf8() etc.
#include <string.h> // needed for strlen() and for strcspn()
/***************************************************************/
/* README */
/***************************************************************/
/* This program tests the function decode_utf8(), which takes
a Unicode character in the form of the hexadecimal value of
its UTF-8 encoding and converts it into the corresponding
Unicode code point in the form U+(hex value).
For example, 0xD096, which corresponds to the Russian capital
letter Ж (ZH), is converted into Unicode code point U+0416.
Written by Thomas Hedden April 2021.
Modified in February 2023.
*/
/***************************************************************/
/* THINGS THAT NEED ATTENTION */
/***************************************************************/
/* #include's and #define's should be bracketed by #ifndef's
#include's that are not used should be removed
#define's that are not used should be removed
variables that are not used should be removed
functions that are not used should be removed
allocated memory should be freed
files should be closed
testing statements should be removed
commented out code should be removed
*/
/***************************************************************/
/* #DEFINE STATEMENTS */
/***************************************************************/
#define MAXARRAY 80 /* size of array into which words
(one word per line expected)
from input files are placed */
#define MXFNSZ 256 /* maximum length of filenames (using
Second Extended Filesystem) */
#define MXSZVAR 32 /* maximum size of ANSI C variables,
including '\0' */
/***************************************************************
* FUNCTION DECLARATIONS *
***************************************************************/
// The functions declared here are defined in separate source files.
// The comments below describe only how main() in this file uses them.
void getinputfile(unsigned char []); // prompt for file name; main() passes a buffer of MXFNSZ bytes
FILE* open_file(char*, char*); // main() passes the input file name and the mode "r" and reads from the returned stream
int fgetu(FILE *); // gets UTF-8 character from FILE *; main() stops reading when it returns 0 or EOF
// Predicates on UTF-8 values. Of these, main() calls only isspaceu()
// and isutf8bom(); the others are declared but not used in this file.
// NOTE(review): the names suggest tests for the byte length of a
// character and for lead bytes -- confirm against the implementations.
bool isb1ofu8(unsigned char);
bool is1butf8(unsigned int);
bool is2butf8(unsigned int);
bool isb1of2b(unsigned int);
bool istbutf8(unsigned int);
bool is3butf8(unsigned int);
bool isb1of3b(unsigned int);
bool is4butf8(unsigned int);
bool isb1of4b(unsigned int);
bool isspaceu(unsigned int); // main() prints a newline for values that satisfy this
bool isutf8bom(unsigned int); // main() skips values that satisfy this (UTF-8 BOM)
// converts the hexadecimal value of a UTF-8 character (e.g. 0xD096)
// to the corresponding Unicode code point (e.g. 0x0416)
unsigned long int decode_utf8(unsigned int);
/***************************************************************
* MAIN FUNCTION *
***************************************************************/
/*
 * Reads UTF-8 characters from an input file with fgetu() and writes
 * the Unicode code point of each one to stdout.
 *
 * argv[1] is the input file name; when it is absent the name is
 * requested through getinputfile().
 *
 * Output rules, applied in this order to every value read:
 *   - isspaceu() true, or line feed (0x0A): a newline is written
 *   - carriage return (0x0D):               nothing is written
 *   - isutf8bom() true:                     nothing is written
 *   - anything else: "U+XXXX " (at least four upper-case hex digits,
 *     the form shown in the README at the top of this file)
 *
 * Returns 0 after the stream has been read to its end.  Exits with
 * EXIT_FAILURE if int is not 4 bytes wide, if the file name given on
 * the command line does not fit in MXFNSZ bytes, or if open_file()
 * returns NULL.
 */
int main(int argc, char *argv[]) {
    // this program requires that the size of an int be 4 bytes
    if (sizeof(int) != 4) {
        fprintf(stderr, "sizeof(int) is not 4!\n");
        exit(EXIT_FAILURE);
    }

    /************************************************************
     * VARIABLE DECLARATIONS                                    *
     ************************************************************/
    unsigned char i_file_name[MXFNSZ]; // holds input file name
    FILE *infp;                        // input stream
    int c;                             // raw result of fgetu()

    /************************************************************
     * CHECK INPUT                                              *
     ************************************************************/
    if (argc < 2) { // no input file name entered, so prompt for it
        getinputfile(i_file_name);
    } else {        // copy the name given on the command line
        size_t name_len = strlen(argv[1]);

        // Refuse names that do not fit (terminator included) rather
        // than writing past the buffer or opening a truncated name.
        if (name_len >= MXFNSZ) {
            fprintf(stderr,
                    "input file name is too long (maximum %d characters)\n",
                    MXFNSZ - 1);
            exit(EXIT_FAILURE);
        }
        memcpy(i_file_name, argv[1], name_len + 1);
    }

    /************************************************************
     * OPEN FILE                                                *
     ************************************************************/
    infp = open_file((char *) i_file_name, "r");
    if (infp == NULL) {
        fprintf(stderr, "could not open input file %s\n",
                (char *) i_file_name);
        exit(EXIT_FAILURE);
    }

    /************************************************************
     * GET INPUT FROM INPUT STREAM & OUTPUT TO OUTPUT STREAM    *
     ************************************************************/
    // A zero value from fgetu() ends the loop, as does EOF.  The
    // result is kept in an int so that the comparison with EOF does
    // not depend on a signed-to-unsigned conversion.
    while ((c = fgetu(infp)) != 0) {
        if (c == EOF || feof(infp)) {
            break; // found EOF, so stop, don't print anything
        }

        // the helper functions take the value as unsigned int
        unsigned int u = (unsigned int) c;

        if (isspaceu(u)) { // print new line instead of space
            fprintf(stdout, "\n");
            continue;
        }
        if (u == 0x0A) {   // print real new line, not its code point
            fprintf(stdout, "\n");
            continue;
        }
        if (u == 0x0D) {   // ignore carriage returns
            continue;
        }
        if (isutf8bom(u)) { // ignore the UTF-8 BOM instead of converting it
            continue;
        }

        // The cast keeps the argument type in step with %lX.
        fprintf(stdout, "U+%04lX ", (unsigned long) decode_utf8(u));
    }

    /************************************************************
     * CLOSE FILES                                              *
     ************************************************************/
    fclose(infp);
    return 0;
}
/***************************************************************/
/* FUNCTION DEFINITIONS */
/***************************************************************/
// all other functions have been put in separate files