/*
 * UTF-8 validator.
 *
 * Reads standard input one byte at a time and feeds each byte to a finite
 * state machine that recognises well-formed UTF-8 byte sequences.
 * Command-line options select what is echoed to standard output:
 *   -u  echo well-formed multi-byte (non-ASCII) characters
 *   -a  echo ASCII characters
 *   -p  report ill-formed byte sequences
 */
#include <getopt.h> /* declares getopt() even in strict ISO C mode */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int verbose1 = false; // show gross parsing state changes
int verbose2 = false; // show legitimate non-cp0 UTF-8 characters
int verbose3 = false; // show cp0 characters (ASCII)
int verbose5 = false; // show UTF-8 problems

unsigned long long location = 0; // offset from beginning of stream

/*
 * States of the recogniser.
 *
 * cp0 is the idle state (waiting for the first byte of a character).
 * cp1..cp8 are entered after the first byte of a multi-byte character and
 * wait for the second byte; cpN_1 waits for the third byte and cpN_2 waits
 * for the fourth byte.
 */
enum utf8_state {
    cp0,   // U+0000..U+007F, ASCII, one byte
    cp1,   // U+0080..U+07FF, two-byte UTF-8, lead C2..DF
    cp2,   // U+0800..U+0FFF, three-byte UTF-8, lead E0
    cp3,   // U+1000..U+CFFF, three-byte UTF-8, lead E1..EC
    cp4,   // U+D000..U+D7FF, three-byte UTF-8, lead ED
    cp5,   // U+E000..U+FFFF, three-byte UTF-8, lead EE..EF
    cp6,   // U+10000..U+3FFFF, four-byte UTF-8, lead F0
    cp7,   // U+40000..U+FFFFF, four-byte UTF-8, lead F1..F3
    cp8,   // U+100000..U+10FFFF, four-byte UTF-8, lead F4
    cp2_1, // cp2: two bytes seen, waiting for the third
    cp3_1, // cp3: two bytes seen, waiting for the third
    cp4_1, // cp4: two bytes seen, waiting for the third
    cp5_1, // cp5: two bytes seen, waiting for the third
    cp6_1, // cp6: two bytes seen, waiting for the third
    cp7_1, // cp7: two bytes seen, waiting for the third
    cp8_1, // cp8: two bytes seen, waiting for the third
    cp6_2, // cp6: three bytes seen, waiting for the fourth
    cp7_2, // cp7: three bytes seen, waiting for the fourth
    cp8_2  // cp8: three bytes seen, waiting for the fourth
};

/* Printable state names, indexed by enum utf8_state (used by verbose1). */
const char *state_name[cp8_2 + 1] = {
    [cp0] = "cp0",     [cp1] = "cp1",     [cp2] = "cp2",
    [cp3] = "cp3",     [cp4] = "cp4",     [cp5] = "cp5",
    [cp6] = "cp6",     [cp7] = "cp7",     [cp8] = "cp8",
    [cp2_1] = "cp2_1", [cp3_1] = "cp3_1", [cp4_1] = "cp4_1",
    [cp5_1] = "cp5_1", [cp6_1] = "cp6_1", [cp7_1] = "cp7_1",
    [cp8_1] = "cp8_1", [cp6_2] = "cp6_2", [cp7_2] = "cp7_2",
    [cp8_2] = "cp8_2",
};

/* Write len bytes to standard output, retrying on partial writes.
 * Output is best effort: a failing write() is silently abandoned. */
static void write_bytes(const void *buf, size_t len)
{
    const uint8_t *p = buf;

    while (len > 0) {
        ssize_t n = write(STDOUT_FILENO, p, len);
        if (n <= 0) {
            return;
        }
        p += n;
        len -= (size_t)n;
    }
}

/* Report an ill-formed sequence of count (1..4) bytes whose last byte sits at
 * the current stream offset `location`. */
static void report_ill_formed(const uint8_t *seq, size_t count)
{
    static const char *const word[] = { "zero", "one", "two", "three", "four" };
    char msg[128];
    size_t used;
    int n;

    if (count < 1 || count > 4) {
        return;
    }

    /* location is the offset of the byte being processed, so the sequence
     * began count-1 bytes earlier. */
    n = snprintf(msg, sizeof msg,
                 "\nThere %s %s ill-formed byte%s starting at offset %llu:",
                 count == 1 ? "is" : "are", word[count],
                 count == 1 ? "" : "s",
                 location - (unsigned long long)(count - 1));
    if (n < 0 || (size_t)n >= sizeof msg) {
        return;
    }
    used = (size_t)n;

    for (size_t i = 0; i < count; i++) {
        n = snprintf(msg + used, sizeof msg - used, " %x", (unsigned)seq[i]);
        if (n < 0 || (size_t)n >= sizeof msg - used) {
            return;
        }
        used += (size_t)n;
    }

    if (used < sizeof msg) {
        msg[used++] = '\n'; /* overwrites the NUL; length is tracked in used */
    }
    write_bytes(msg, used);
}

/*
 * Advance the UTF-8 recogniser by one byte.
 *
 * c             - the next input byte
 * current_state - the state the machine is in (an enum utf8_state value)
 * new_state     - receives the state to use for the following byte
 *
 * Returns true when c is acceptable in current_state (either it completes a
 * character, in which case *new_state is cp0, or more bytes are needed).
 * Returns false when the state machine fails; *new_state is then reset back
 * to cp0.
 *
 * The byte that breaks a sequence is consumed together with the bytes before
 * it; it is not re-examined as the possible start of a new character.
 *
 * The bytes of the character in progress are kept in static storage, so the
 * function must be fed one stream at a time.
 */
bool fsm_utf8(uint8_t c, int current_state, int *new_state)
{
    static uint8_t seq[4]; /* bytes of the character being assembled */
    uint8_t lo = 0x80;     /* accepted range for c; most trailing bytes */
    uint8_t hi = 0xbf;     /* are plain continuation bytes 80..BF      */
    size_t index;          /* position of c within the character */
    int next;              /* state after accepting c; cp0 = complete */

    switch (current_state) {
    case cp0:
        seq[0] = c;
        if (c <= 0x7f) {
            /* ASCII: a complete character, stay in cp0 */
            *new_state = cp0;
            if (verbose3) {
                write_bytes(seq, 1);
            }
            return true;
        }
        if (c >= 0xc2 && c <= 0xdf) {
            *new_state = cp1;
        } else if (c == 0xe0) {
            *new_state = cp2;
        } else if (c >= 0xe1 && c <= 0xec) {
            *new_state = cp3;
        } else if (c == 0xed) {
            *new_state = cp4;
        } else if (c == 0xee || c == 0xef) {
            *new_state = cp5;
        } else if (c == 0xf0) {
            *new_state = cp6;
        } else if (c >= 0xf1 && c <= 0xf3) {
            *new_state = cp7;
        } else if (c == 0xf4) {
            *new_state = cp8;
        } else {
            /* 80..C1 and F5..FF can never start a well-formed sequence */
            if (verbose5) {
                report_ill_formed(seq, 1);
            }
            *new_state = cp0;
            return false;
        }
        return true;

    /* second byte */
    case cp1: index = 1; next = cp0; break;
    case cp2: index = 1; next = cp2_1; lo = 0xa0; break; /* no overlong forms */
    case cp3: index = 1; next = cp3_1; break;
    case cp4: index = 1; next = cp4_1; hi = 0x9f; break; /* no surrogates */
    case cp5: index = 1; next = cp5_1; break;
    case cp6: index = 1; next = cp6_1; lo = 0x90; break; /* no overlong forms */
    case cp7: index = 1; next = cp7_1; break;
    case cp8: index = 1; next = cp8_1; hi = 0x8f; break; /* at most U+10FFFF */

    /* third byte */
    case cp2_1:
    case cp3_1:
    case cp4_1:
    case cp5_1:
        index = 2; next = cp0; break;
    case cp6_1: index = 2; next = cp6_2; break;
    case cp7_1: index = 2; next = cp7_2; break;
    case cp8_1: index = 2; next = cp8_2; break;

    /* fourth byte */
    case cp6_2:
    case cp7_2:
    case cp8_2:
        index = 3; next = cp0; break;

    default:
        printf("This is impossible!\n");
        *new_state = cp0;
        return false;
    }

    seq[index] = c;
    if (c < lo || c > hi) {
        /* error! back to cp0 */
        if (verbose5) {
            report_ill_formed(seq, index + 1);
        }
        *new_state = cp0;
        return false;
    }

    *new_state = next;
    if (next == cp0 && verbose2) {
        /* recognized! echo the whole character */
        write_bytes(seq, index + 1);
    }
    return true;
}

/* Parse the command line: -u, -a and -p switch on verbose2, verbose3 and
 * verbose5 respectively. Any other option prints a usage line to stderr and
 * exits with status 1. */
void do_options(int argc, char *argv[])
{
    int opt;

    while ((opt = getopt(argc, argv, "uap")) != -1) {
        switch (opt) {
        case 'u':
            verbose2 = true;
            break;
        case 'a':
            verbose3 = true;
            break;
        case 'p':
            verbose5 = true;
            break;
        default:
            fprintf(stderr, "Usage: %s [-u] [-a] [-p]\n", argv[0]);
            exit(1);
        }
    }
}

/* Feed standard input through the recogniser one byte at a time. */
int main(int argc, char *argv[])
{
    do_options(argc, argv);

    uint8_t c;
    int current_state = cp0;
    int new_state = cp0;

    while (read(STDIN_FILENO, &c, 1) == 1) {
        /* Failures are reported inside fsm_utf8 (when -p is given); the
         * stream is processed to the end regardless. */
        (void)fsm_utf8(c, current_state, &new_state);
        if (verbose1) {
            write_bytes(&c, 1);
            printf(" : current_state = %s, new_state = %s \n",
                   state_name[current_state], state_name[new_state]);
            fflush(stdout);
        }
        current_state = new_state;
        location++;
    }

    /* Input that stops part-way through a character is ill-formed too. */
    if (current_state != cp0 && verbose5) {
        char msg[96];
        int n = snprintf(msg, sizeof msg,
                         "\nInput ends in the middle of a UTF-8 sequence at offset %llu\n",
                         location);
        if (n > 0 && (size_t)n < sizeof msg) {
            write_bytes(msg, (size_t)n);
        }
    }
    return 0;
}