From nmyers@mntgfx.MENTOR.COM Wed Jan 14 17:35:26 1987 Path: beno!seismo!husc6!panda!genrad!decvax!tektronix!sequent!mntgfx!nmyers From: nmyers@mntgfx.MENTOR.COM (Nathan Myers) Newsgroups: net.sources Subject: C++ Lexical Scanner (in C++) Keywords: object parse compiler cfront Message-ID: <431@mntgfx.MENTOR.COM> Date: 14 Jan 87 22:35:26 GMT Organization: Mentor Graphics, Beaverton OR Lines: 1054 C++ Lexical Scanner (in C++) Following this note is a shar-format file containing a lexical scanner for C++. Use it however you like. It should be adequate to drop into a full compiler. For such a use, error reporting would probably need some refinement. Also, if the compiler was to have a built-in preprocessor (i.e. not using cpp) the handler for "#" lines would need a bit of work. I will be interested in bug reports, significant improvements. Nathan Myers tektronix!sequent!mntgfx!nmyers Share and enjoy! -------------------- cut here ----------------------------- #! /bin/sh # This is a shar format file. Extract with sh, not csh. # echo "x - read.me" sed -e 's/^_ //' >read.me <<'%%%EOF%%%' _ _ The following source files are included: _ _ clex.c _ clex.h _ clex_sym.h _ clex_test.c _ kwhash.c _ Makefile _ _ They implement a self-contained lexical scanner class for C++. _ It is extensible by derivation primarily in the area of _ processing "#" compiler directives (currently, it only _ interprets the "#line" construct). _ _ It has one other degree of flexibility, in its handling _ of bracket-enclosed expressions "[]". These may be treated _ as a normal sequence of tokens or as delimited strings; _ the former is of greater use in a traditional parser, while _ the latter is favored for extraction of declarations by _ a code browser. _ _ To allay some confusion, I should point out here that _ clex_sym.h is used in an unusual way: it is included _ twice in the module clex.c; once for declaration _ part, once for the (static) definition part. It is _ built this way to keep all knowledge of keywords _ in a single place. _ _ The file kwhash.c is a standard C standalone program used _ to arrive at the collision-free hash function used to _ recognize C++ keywords. Any new keyword stands about _ one chance in 3 of colliding with an existing keyword, _ thus requiring that a new hash function be generated. _ _ The file clex_test.c compiles to a program which reads _ C or C++ code from standard input and emits token names _ on the standard output. Try it with different values _ in the constructor's second argument. %%%EOF%%% echo "x - Makefile" sed -e 's/^_ //' >Makefile <<'%%%EOF%%%' _ # Makefile for clex/cparse _ INCLUDE=/usr/local/include/CC _ CCC=/user/local/CC _ CCOPTS= -g -O _ # -g: include debug info -O: optimize _ _ all: kwhash clex_test _ _ kwhash: kwhash.c /usr/include/stdio.h _ /bin/cc ${CCOPTS} -o kwhash kwhash.c _ _ clex_test: clex_test.o clex.o _ ${CCC} -g -o clex_test clex_test.o clex.o _ _ .c.o: _ /usr/lib/cpp -I${INCLUDE} -Dc_plusplus=1 $*.c >$*.cpp _ /user/mentor/local/cfront +L +f$*.c <$*.cpp >$*..c && /bin/rm $*.cpp _ /bin/cc -c ${CCOPTS} $*..c && /bin/mv $*..o $*.o _ _ clex.o: ${INCLUDE}/stdio.h \ _ ${INCLUDE}/stream.h \ _ ${INCLUDE}/string.h \ _ ${INCLUDE}/stdlib.h \ _ ${INCLUDE}/ctype.h \ _ ${INCLUDE}/assert.h \ _ clex.h clex_sym.h _ _ clex_test.o: ${INCLUDE}/stdio.h \ _ clex.h clex_sym.h _ %%%EOF%%% echo "x - clex.h" sed -e 's/^_ //' >clex.h <<'%%%EOF%%%' _ _ _ #ifndef INCLUDED_CLEX _ #define INCLUDED_CLEX 1 _ _ #ifndef INCLUDED_STDIO _ #include _ #endif _ _ enum Boolean { FALSE, TRUE }; _ _ #include "clex_sym.h" _ _ class Clex _ { _ friend class Cparse; _ enum Clex_mode _ { CL_NONE=0, CL_COMMENT=1, CL_QUOTE=2, CL_POUND=4, CL_BRACK=8 }; _ _ protected: _ short look; // a one-char lookahead _ FILE* fp; _ Boolean block_brack; // if TRUE, treat contents of "[]" as a string _ long line_num; // line number in original source file _ char filename[256]; // name of original source file _ short bufsiz; // number of chars currently in buf _ char buf[256]; _ _ void eat_one() { look = short(getc(fp)); } _ void put_in_buf(char c) { if (bufsiz < sizeof(buf)-1) buf[bufsiz++] = c; } _ void buf_one() { put_in_buf(look); eat_one(); } _ Clex_sym terminate(Clex_sym s) { buf[bufsiz] = '\0'; return s; } _ Clex_sym eat_return(Clex_sym); _ Clex_sym num(char); _ Clex_sym ident(char); _ Clex_sym lbrack(Clex_mode); _ Clex_sym quote(char, Clex_sym, Clex_mode); _ void block_comment(Clex_mode); _ void line_comment(); _ void eoln(Clex_mode); _ _ virtual Boolean pound(Clex_mode, char*, short len); _ _ public: _ Clex_sym next(); _ const char* str() { return buf; } _ short strlen() { return bufsiz; } _ long line_no() { return line_num; } _ const char* fname() { return filename; } _ const char* debug(Clex_sym); _ _ Clex(FILE*, Boolean block_brack); _ }; _ _ #endif _ %%%EOF%%% echo "x - clex_sym.h" sed -e 's/^_ //' >clex_sym.h <<'%%%EOF%%%' _ _ /* clex_sym.h: _ // This file defines both an enum {} type named "symbol", and _ // a variable sym_str[] with string representations of the _ // symbols. It is intended to maintain an exact _ // correspondence between array entries and symbol values. _ */ _ _ /* _ This file is #include'd twice: once for the enum _ (with CLEX_IMPLEMENTATION turned off) and once for _ the array initialization (with it turned on). The _ lower-numbered symbols have uppercase name strings, _ but the keyword symbol strings are stored separately. _ _ If a keyword is to be added, add it first to the _ standalone program kwhash.c and generate a new _ perfect hash function for the new set. Then add _ it to both places below and modify the hash function _ and table size in clex.c. _ */ _ _ #ifndef CLEX_IMPLEMENTATION _ _ #define CLEX_S(sym) sym _ #define CLEX_S2(sym1,sym2) sym1 _ enum Clex_sym _ { _ _ #else /* CLEX_IMPLEMENTATION */ _ _ #undef CLEX_S _ #undef CLEX_S2 _ #define CLEX_S(sym) "sym" _ #define CLEX_S2(sym1,sym2) sym2 _ static char* sym_str[] = _ { _ _ #endif /* CLEX_IMPLEMENTATION */ _ _ CLEX_S(NONE_S = 0), /* should never get this */ _ _ CLEX_S(ERROR_S), _ CLEX_S( ERROR_EOLN_S), _ CLEX_S( ERROR_EOF_S), _ CLEX_S( ERROR_UNKN_S), _ #ifndef CLEX_IMPLEMENTATION _ CLEX_S(ERROR_MAX = ERROR_UNKN_S), _ #endif _ _ CLEX_S(EOF_S), _ CLEX_S(EOLN_S), // \n _ _ CLEX_S(BANG_S), // ! _ CLEX_S( NE_S), // != _ CLEX_S(QUOTE_S), // " _ CLEX_S(POUND_S), // # _ CLEX_S(MOD_S), // % _ CLEX_S( MODAS_S), // %= _ CLEX_S(AMPER_S), // & _ CLEX_S( LAND_S), // && _ CLEX_S( ANDAS_S), // &= _ CLEX_S(APOS_S), // ' _ CLEX_S(LPAR_S), // ( _ CLEX_S(RPAR_S), // ) _ CLEX_S(STAR_S), // * _ CLEX_S( MULAS_S), // *= _ CLEX_S(PLUS_S), // + _ CLEX_S( INCRE_S), // ++ _ CLEX_S( ADDAS_S), // += _ CLEX_S(COMMA_S), // ), _ CLEX_S(MINUS_S), // - _ CLEX_S( DECRE_S), // -- _ CLEX_S( SUBAS_S), // -= _ CLEX_S( DEREF_S), // -> _ CLEX_S(DOT_S), // . _ CLEX_S( ELLIP_S), // ... _ CLEX_S(SLASH_S), // / _ CLEX_S( DIVAS_S), // /= _ CLEX_S(COLON_S), // : _ CLEX_S( SCOPE_S), // :: _ CLEX_S(SEMI_S), // ; _ CLEX_S(LT_S), // < _ CLEX_S( LE_S), // <= _ CLEX_S( SHL_S), // << _ CLEX_S( SHLAS_S), // <<= _ CLEX_S(AS_S), // = _ CLEX_S( EQ_S), // == _ CLEX_S(GT_S), // > _ CLEX_S( GE_S), // >= _ CLEX_S( SHR_S), // >> _ CLEX_S( SHRAS_S), // >>= _ CLEX_S(QUEST_S), // ? _ CLEX_S(AT_S), // @ (undefined) _ CLEX_S(LBRACK_S), // [ _ CLEX_S(BSLASH_S), // \ _ CLEX_S(RBRACK_S), // ] _ CLEX_S(CARET_S), // ^ _ CLEX_S( XORAS_S), // ^= _ CLEX_S(GRAVE_S), // ` (undefined) _ CLEX_S(LBRACE_S), // { _ CLEX_S(VBAR_S), // | _ CLEX_S( LOR_S), // || _ CLEX_S( ORAS_S), // |= _ CLEX_S(RBRACE_S), // } _ CLEX_S(TILDE_S), // ~ _ _ CLEX_S(IDENT_S), // a name, or string that could be a name _ CLEX_S(NUM_S), // a numeric string _ CLEX_S(FLOATNUM_S) // a recognizably floating-point num _ _ #ifndef CLEX_IMPLEMENTATION _ , CLEX_S(KEYWORD_S), _ _ #else _ _ }; _ static char *keywords[] = _ { _ _ #endif _ _ CLEX_S2(ASM_S = KEYWORD_S, "asm"), _ CLEX_S2(AUTO_S, "auto"), _ CLEX_S2(BREAK_S, "break"), _ CLEX_S2(CASE_S, "case"), _ CLEX_S2(CHAR_S, "char"), _ CLEX_S2(CLASS_S, "class"), _ CLEX_S2(CONST_S, "const"), _ CLEX_S2(CONTINUE_S, "continue"), _ CLEX_S2(DEFAULT_S, "default"), _ CLEX_S2(DELETE_S, "delete"), _ CLEX_S2(DO_S, "do"), _ CLEX_S2(DOUBLE_S, "double"), _ CLEX_S2(ELSE_S, "else"), _ CLEX_S2(ENUM_S, "enum"), _ CLEX_S2(EXTERN_S, "extern"), _ CLEX_S2(FLOAT_S, "float"), _ CLEX_S2(FOR_S, "for"), _ CLEX_S2(FRIEND_S, "friend"), _ CLEX_S2(GOTO_S, "goto"), _ CLEX_S2(IF_S, "if"), _ CLEX_S2(INLINE_S, "inline"), _ CLEX_S2(INT_S, "int"), _ CLEX_S2(LONG_S, "long"), _ CLEX_S2(NEW_S, "new"), _ CLEX_S2(OPERATOR_S, "operator"), _ CLEX_S2(OVERLOAD_S, "overload"), _ CLEX_S2(PRIVATE_S, "private"), _ CLEX_S2(PROTECTED_S,"protected"), _ CLEX_S2(PUBLIC_S, "public"), _ CLEX_S2(REGISTER_S, "register"), _ CLEX_S2(RETURN_S, "return"), _ CLEX_S2(SHORT_S, "short"), _ CLEX_S2(SIGNED_S, "signed"), _ CLEX_S2(SIZEOF_S, "sizeof"), _ CLEX_S2(STATIC_S, "static"), _ CLEX_S2(STRUCT_S, "struct"), _ CLEX_S2(SWITCH_S, "switch"), _ CLEX_S2(THIS_S, "this"), _ CLEX_S2(TYPEDEF_S, "typedef"), _ CLEX_S2(UNION_S, "union"), _ CLEX_S2(UNSIGNED_S, "unsigned"), _ CLEX_S2(VIRTUAL_S, "virtual"), _ CLEX_S2(VOLATILE_S, "volatile"), _ CLEX_S2(VOID_S, "void"), _ CLEX_S2(WHILE_S, "while"), _ _ CLEX_S2(END_OF_SYMBOLS_S, NULL) _ }; _ _ #ifndef CLEX_IMPLEMENTATION _ const CLEX_NUMKEYS = (END_OF_SYMBOLS_S - KEYWORD_S); _ #endif _ %%%EOF%%% echo "x - clex.c" sed -e 's/^_ //' >clex.c <<'%%%EOF%%%' _ _ #ifndef INCLUDED_STREAM _ #include _ #endif _ #ifndef INCLUDED_STRING _ #include _ #endif _ #ifndef INCLUDED_STDLIB _ #include _ #endif _ #ifndef INCLUDED_ASSERT _ #include _ #endif _ #ifndef INCLUDED_CTYPE _ #include _ #endif _ _ #include "clex.h" _ _ // get string value tables, sym_str[] and keyword[] : _ #define CLEX_IMPLEMENTATION 1 _ #include "clex_sym.h" _ _ /****************************************************************************** _ * * _ * KWTABLE -- keyword hash table (internal use only) * _ * KWtable implements a collision-free hash table of C++ keywords. The * _ * table size and hash function are computed by use of a standalone C * _ * program, kwhash.c, included in this directory. * _ * * _ ******************************************************************************/ _ _ #define U_short unsigned short _ #define U_char unsigned char _ _ struct KWtable _ { _ enum { HASHSIZE = 131 }; // as computed by kwhash.c, for a=9,b=2,c=2 _ _ struct { _ char* kwp; _ Clex_sym sym; _ } kwhash[HASHSIZE]; _ _ KWtable(char**); _ U_short hash(const U_char*, U_short len); _ void insert(char*, Clex_sym); _ Clex_sym lookup(char*, short len); _ }; _ _ static KWtable kwt = KWtable(keywords); // keywords[] defined in Clex_sym.h _ _ KWtable:: _ KWtable (char** kwl) _ { _ short int i; _ for (i = 0; i < HASHSIZE; ++i) _ kwhash[i].kwp = NULL; _ for (i = 0; i < CLEX_NUMKEYS; ++i) _ insert(kwl[i], KEYWORD_S + i); _ // rely on assert() to prevent hash collisions -- may need _ // a new hash function or table size when keyword added. _ } _ _ // the values used in the following hash function, and HASHSIZE, were _ // determined by use of the standalone C program kwhash.c, to _ // ensure that no collisions occur. _ _ inline _ U_short KWtable:: _ hash (const U_char* cp, U_short len) _ { _ return (((U_short)cp[0] ) ^ _ ((U_short)cp[1] << 9) ^ _ ((U_short)cp[len-1] << 2) ^ _ (len << 2) ) % HASHSIZE; _ } _ _ void KWtable:: _ insert (char* cp, Clex_sym s) _ { _ U_short h = hash(cp, strlen(cp)); _ assert(kwt.kwhash[h].kwp == NULL); // collisions not permitted. _ kwt.kwhash[h].kwp = cp; _ kwt.kwhash[h].sym = s; _ } _ _ Clex_sym KWtable:: _ lookup (char* cp, short len) _ { _ if (len < 2 || len > 9) return (IDENT_S); _ short h = hash(cp, len); _ if (kwt.kwhash[h].kwp == NULL) return (IDENT_S); _ if (strcmp(kwt.kwhash[h].kwp, cp)) return (IDENT_S); _ return (kwt.kwhash[h].sym); _ } _ _ /****************************************************************************** _ * * _ * CLEX -- c++ lexical scanner * _ * * _ ******************************************************************************/ _ _ // CONSTRUCTOR Clex: _ // The argument block_brack, if TRUE, dictates that the contents _ // of square brackets "[]" be returned as a string in the string _ // buffer. If false, square brackets are treated as simple tokens. _ _ Clex:: _ Clex (FILE* f, Boolean b) _ { _ fp = f; _ block_brack = b; _ filename[0] = '\0'; _ bufsiz = 0; buf[0] = '\0'; _ // prime the pipeline: _ line_num = 0; _ look = '\n'; // be prepared to handle '#' as first char _ } _ _ Clex_sym Clex:: _ num (char c) _ { _ Clex_sym s = NUM_S; _ _ bufsiz = 0; _ put_in_buf(c); _ while (isdigit(look)) _ buf_one(); _ _ // hexadecimal _ if (bufsiz == 1 && *buf == '0' && (look == 'x' || look == 'X')) _ { _ do { buf_one(); } _ while (isxdigit(look)); _ if (look == 'L' || look == 'l' || look == 'U' || look == 'u') _ buf_one(); _ return terminate(s); _ } _ _ // long or unsigned _ if (look == 'L' || look == 'l' || look == 'U' || look == 'u') _ { buf_one(); return terminate(NUM_S); } _ _ // floating point _ else if (look == '.') _ { _ s = FLOATNUM_S; _ do { buf_one(); } _ while (isdigit(look)); _ } _ _ // scientific notation _ if (look == 'e' || look == 'E') _ { _ s = FLOATNUM_S; _ do { buf_one(); } _ while (isdigit(look)); _ } _ else _ return terminate(s); _ _ if (look == '+' || look == '-') _ do { buf_one(); } _ while (isdigit(look)); _ return terminate(s); _ } _ _ Clex_sym Clex:: _ ident (char first) _ { _ register Boolean maybe_kw = TRUE; _ register short bs = 0; _ buf[bs++] = first; _ while (isalnum(look) || look == '_' || look == '$') _ { _ // note: this function accounts for 30% of the total scan time _ if (maybe_kw && (isupper(look) || look == '_' )) _ maybe_kw = FALSE; _ buf[bs++] = look; // don't worry about overflow _ eat_one(); _ } _ buf[bs] = '\0'; _ bufsiz = bs; _ _ if (maybe_kw) _ return kwt.lookup(buf, bufsiz); _ return IDENT_S; _ } _ _ Clex_sym Clex:: _ quote (char c, Clex_sym s, Clex_mode m) _ { _ if (m == CL_NONE) _ bufsiz = 0; _ while (look != c) _ { _ if (look == EOF) _ { return terminate(ERROR_EOF_S); } _ else if (look == '\n') _ { return terminate(ERROR_EOLN_S); } _ else if (look == '\\') _ { _ eat_one(); _ if (look == '\n') _ { eat_one(); eoln(m|CL_QUOTE); continue; } _ else if (look == EOF) _ { return terminate(ERROR_EOF_S); } _ else _ put_in_buf('\\'); // this handles \' and \" too. _ } _ buf_one(); _ } _ eat_one(); // eat the closing quote _ return terminate(s); _ } _ _ _ // lbrack() accumulates the contents between "[" and "]" into _ // the string buffer, handling syntactically quoted strings, _ // comments, and nested brackets. Note that lbrack() is _ // called recursively in the case of nested brackets. _ _ Clex_sym Clex:: _ lbrack (Clex_mode m) _ { _ if (m == CL_NONE) _ bufsiz = 0; _ while (look != ']') _ { _ if (look == EOF) _ return terminate(ERROR_EOF_S); _ _ else if (look == '\n') _ { eat_one(); eoln(m|CL_BRACK); } _ else if (look == '[') _ { _ buf_one(); _ if (lbrack(m|CL_BRACK) == ERROR_EOF_S) _ return ERROR_EOF_S; // already cleaned up. _ else put_in_buf(']'); _ } _ else if (look == '\'' || look == '"') _ { _ char c = look; _ buf_one(); _ (void) quote(c, NONE_S, m|CL_BRACK); _ put_in_buf(c); _ } _ else if (look == '/') // maybe a comment _ { _ eat_one(); _ if (look == '/') _ line_comment(); _ else if (look == '*') _ { _ block_comment(m|CL_BRACK); _ if (look == EOF) return terminate(ERROR_EOF_S); _ } _ else // stash the '/' and the char after _ { put_in_buf('/'); buf_one(); } _ } _ else // just a character to save _ buf_one(); _ } _ _ eat_one(); // eat the ']'. _ return terminate(LBRACK_S); _ } _ _ _ void Clex:: _ block_comment(Clex_mode m) _ { _ eat_one(); // eat the '*' _ while (! (look == '*' && (eat_one(), look == '/')) ) _ { _ if (look == EOF) return; _ if (look == '\n') { eat_one(); eoln(m|CL_COMMENT); } _ else if (look != '*') eat_one(); _ } _ eat_one(); // eat the '/' _ } _ _ void Clex:: _ line_comment() _ { _ do { eat_one(); } _ while (look != '\n' && look != EOF); _ } _ _ // eat_return() is intended to save space in Clex::next() -- the _ // inline function eat_one() produces quite a lot of code. _ Clex_sym Clex:: _ eat_return(Clex_sym s) _ { eat_one(); return s; } _ _ Clex_sym Clex:: _ next() _ { _ short val; _ while (val = look, eat_one(), val != EOF) _ { _ char ch = char(val); _ switch (ch) _ { _ case ' ' : continue; _ _ case '_' : _ case '$' : return ident(ch); _ _ case '0' : case '1' : case '2' : case '3' : case '4' : _ case '5' : case '6' : case '7' : case '8' : case '9' : _ return num(ch); _ _ case ',' : return COMMA_S; _ case ';' : return SEMI_S; _ case '[' : if (block_brack) return lbrack(CL_NONE); _ else return LBRACK_S; _ case ']' : return RBRACK_S; _ case '{' : return LBRACE_S; _ case '}' : return RBRACE_S; _ case '(' : return LPAR_S; _ case ')' : return RPAR_S; _ case '~' : return TILDE_S; _ case '?' : return QUEST_S; _ case '"' : return quote(ch, QUOTE_S, CL_NONE); _ case '\'': return quote(ch, APOS_S, CL_NONE); _ _ case '=' : // '=', '==' _ if (look != '=') return AS_S; _ else return eat_return(EQ_S); _ _ case ':' : // ":", "::" _ if (look != ':') return COLON_S; _ else return eat_return(SCOPE_S); _ _ case '!' : // "!", "!=" _ if (look != '=') return BANG_S; _ else return eat_return(NE_S); _ _ case '^' : // "^", "^=" _ if (look != '=') return CARET_S; _ else return eat_return(XORAS_S); _ _ case '*' : // '*', '*=' _ if (look != '=') return STAR_S; _ else return eat_return(MULAS_S); _ _ case '%' : // '%', '%=' _ if (look != '=') return MOD_S; _ else return eat_return(MODAS_S); _ _ case '|' : // "|=", "||", "|" _ if (look == '|') return eat_return(LOR_S); _ else if (look == '=') return eat_return(ORAS_S); _ else return VBAR_S; _ _ case '&' : // "&", "&=", "&&" _ if (look == '&') return eat_return(LAND_S); _ else if (look == '=') return eat_return(ANDAS_S); _ else return AMPER_S; _ _ case '+' : // '+', '++', '+=' _ if (look == '+') return eat_return(INCRE_S); _ else if (look == '=') return eat_return(ADDAS_S); _ else return PLUS_S; _ _ case '-' : // '--', '-=', '->', '-', _ if (look == '-') return eat_return(DECRE_S); _ else if (look == '=') return eat_return(SUBAS_S); _ else if (look == '>') return eat_return(DEREF_S); _ else return MINUS_S; _ _ case '/' : // '/*', '//', '/=', '/' _ if (look == '*') _ { _ block_comment(CL_NONE); _ if (look == EOF) // almost certainly a mistake: _ return ERROR_EOF_S; _ else continue; _ } _ else if (look == '/') _ { line_comment(); continue; } _ else if (look == '=') return eat_return(DIVAS_S); _ else return SLASH_S; _ _ case '.' : // ".", "..." _ if (isdigit(look)) return num(ch); _ else if (look == '.') _ { _ eat_one(); // check for "..", undefined. _ if (look != '.') return ERROR_UNKN_S; _ else return eat_return(ELLIP_S); _ } _ else return DOT_S; _ _ case '<' : // '<=', '<', '<<', '<<=' _ if (look == '=') return eat_return(LE_S); _ else if (look == '<') _ { _ eat_one(); _ if (look != '=') return SHL_S; _ else return eat_return(SHLAS_S); _ } _ else return LT_S; _ _ case '>' : // '>=', '>', '>>', '>>=' _ if (look == '=') return eat_return(GE_S); _ else if (look == '>') _ { _ eat_one(); _ if (look != '=') return SHR_S; _ else return eat_return(SHRAS_S); _ } _ else return GT_S; _ _ default: _ if (isalpha(ch)) _ return ident(ch); _ if (ch == '\n') _ eoln(CL_NONE); _ else if (iscntrl(ch)) _ continue; _ else _ return ERROR_UNKN_S; _ } _ } _ _ return EOF_S; _ } _ _ struct Quickbuf _ { _ short len; _ char line[10240]; _ void put_in(char c) { if (len < sizeof(line)-1) line[len++] = c; } _ void terminate() { line[len] = '\0'; } _ Quickbuf() { len = 0; } _ }; _ _ void Clex:: _ eoln(Clex_mode m) _ { _ // assume NL character already eaten. _ ++line_num; _ // don't process '#' lines in quotes, comments, or '#' continuations. _ if (m & (CL_QUOTE|CL_POUND|CL_COMMENT)) _ return; _ _ // eat whitespace _ while (look != EOF && look != '\n') _ { _ if (look == ' ' || iscntrl(char(look))) eat_one(); _ else break; _ } _ if (look != '#') _ return; _ _ // eat the '#' and subsequent whitespace _ do { eat_one(); if (look == EOF || look == '\n') break; } _ while (look == ' ' || iscntrl(char(look))); _ _ // collect the '#' line _ Quickbuf b; _ do { // record line _ if (look == '\\') // check for continuation line _ { _ eat_one(); _ if (look == '\n') { eat_one(); eoln(m|CL_POUND); } _ else { b.put_in('\\'); } _ } _ else if (look == '/') // check for comment in '#' line _ { _ eat_one(); _ if (look == '*') _ { _ block_comment(m|CL_POUND); _ if (look == EOF) break; _ } _ else if (look == '/') line_comment(); _ else { b.put_in('/'); } _ } _ else _ { _ if (iscntrl(char(look))) look = ' '; _ b.put_in(look); _ eat_one(); _ } _ _ } while (look != '\n' && look != EOF); _ b.terminate(); _ _ (void) pound(m, b.line, b.len); // call virtual handler _ } _ _ Boolean Clex:: _ pound (Clex_mode m, char* line, short len) _ { _ void(m); // to keep cfront blissful _ char* cp = line; _ if (!isdigit(*cp)) _ { _ if (len < 5) return FALSE; _ if (strncmp(cp, "line ", 5) != 0) _ return FALSE; // don't know what it is _ cp += 4; _ while (*cp == ' ') ++cp; _ if (!isdigit(*cp)) _ return FALSE; _ } _ _ // # "" or #line "" _ line_num = atoi(cp) - 1; // will be incremented by eoln() later _ while (isdigit(*cp)) ++cp; _ while (*cp == ' ') ++cp; _ if (*cp == '"') _ { _ char* cpq = cp; _ do { ++cpq; } _ while (*cpq != '"' && *cpq != '\0'); _ strncpy(filename, cp+1, cpq - cp - 1); _ filename[cpq - cp - 1] = '\0'; _ } _ _ return TRUE; _ } _ _ const char* Clex:: _ debug (Clex_sym s) _ { _ return (s >= KEYWORD_S) ? keywords[s - KEYWORD_S] : sym_str[s] ; _ } %%%EOF%%% echo "x - kwhash.c" sed -e 's/^_ //' >kwhash.c <<'%%%EOF%%%' _ _ /* this is a C program */ _ _ #include _ _ static char *keywords[] = _ { _ "asm", _ "auto", _ "break", _ "case", _ "char", _ "class", _ "const", _ "continue", _ "default", _ "delete", _ "do", _ "double", _ "else", _ "enum", _ "extern", _ "float", _ "for", _ "friend", _ "goto", _ "if", _ "inline", _ "int", _ "long", _ "new", _ "operator", _ "overload", _ "private", _ "protected", _ "public", _ "register", _ "return", _ "short", _ "signed", _ "sizeof", _ "static", _ "struct", _ "switch", _ "this", _ "typedef", _ "union", _ "unsigned", _ "virtual", _ "volatile", _ "void", _ "while" _ }; _ _ #define KW_NUMKEYS (sizeof(keywords)/sizeof(*keywords)) _ _ unsigned int hashsize = 137; _ char** kwhash; _ typedef unsigned short u_short; _ _ u_short _ hash(cp, len, a, b, c) _ unsigned char* cp; _ u_short len; _ u_short a, b, c; _ { _ return (((u_short)cp[0] ) ^ _ ((u_short)cp[1] << a) ^ _ ((u_short)cp[len-1] << b) ^ _ (len << c) ) % hashsize; _ } _ _ int _ insert(cp, a, b, c) _ char *cp; _ u_short a, b, c; _ { _ short h; _ _ h = hash(cp, strlen(cp), a, b, c); _ if (kwhash[h] != NULL) _ { _ /* _ printf("Keyword hash collision: %s, %s\n", kwhash[h], cp); _ */ _ return 0; _ } _ else _ kwhash[h] = cp; _ return 1; _ } _ _ int _ try(a, b, c) _ short a, b, c; _ { _ short int i; _ int collisions; _ _ collisions = 0; _ for (i = 0; i < hashsize; ++i) _ kwhash[i] = NULL; _ for (i = 0; i < KW_NUMKEYS; ++i) _ if (!insert(keywords[i], a, b, c)) _ ++collisions; _ return collisions; _ } _ _ main(argc, argv) _ int argc; _ char **argv; _ { _ int min_collisions; _ int min_abc = 0; _ short a, b, c; _ _ if (argc > 1) hashsize = atoi(argv[1]); _ else _ { _ printf("usage: %s \n\t should be prime.\n", _ argv[0]); _ exit(-1); _ } _ _ if (hashsize < KW_NUMKEYS) _ { _ printf("Hash table is too small.\n"); _ exit(-1); _ } _ _ kwhash = (char**) malloc(hashsize * sizeof(char*)); _ min_collisions = hashsize + 1; _ for (a = 0; a <= 10; ++a) _ { _ for (b = 0; b <= 10; ++b) _ { _ for (c = 0; c <= 10; ++c) _ { _ int collisions; _ _ collisions = try(a, b, c); _ if (collisions <= min_collisions) _ { _ printf("abc: %03x Collisions: %2d ", _ ((a<<8)|(b<<4)|c), collisions); _ min_collisions = collisions; _ if (collisions == 0) putchar('*'); _ putchar('\n'); _ } _ } _ } _ } _ } %%%EOF%%% echo "x - clex_test.c" sed -e 's/^_ //' >clex_test.c <<'%%%EOF%%%' _ _ // clex_test -- test code for clex.o _ _ #include "clex.h" _ _ main() _ { _ Clex cl = Clex(stdin, TRUE); _ Clex_sym s; _ do { _ s = cl.next(); _ printf("%5D ", cl.line_no()); _ if (s >= KEYWORD_S) _ printf(" %s\n", cl.debug(s)); _ else if (s == IDENT_S || _ s == NUM_S || _ s == FLOATNUM_S || _ s == LBRACK_S || _ s == APOS_S || _ s == QUOTE_S ) _ printf( " %s \"%s\"\n", cl.debug(s), cl.str()); _ else _ printf( " %s\n", cl.debug(s)); _ } while (s > EOF_S); _ _ exit(0); _ } %%%EOF%%% echo "Done." exit 0