zephyr/zwgc/lexer.c

   1 /* This file is part of the Project Athena Zephyr Notification System.
   2  * It is one of the source files comprising zwgc, the Zephyr WindowGram
   3  * client.
   4  *
   5  *      Created by:     Marc Horowitz <marc@athena.mit.edu>
   6  *
   7  *      $Id: lexer.c 2144 2008-01-21 07:57:32Z kcr $
   8  *
   9  *      Copyright (c) 1989 by the Massachusetts Institute of Technology.
  10  *      For copying and distribution information, see the file
  11  *      "mit-copyright.h".
  12  */
  13
  14 #include <sysdep.h>
  15
  16 #if (!defined(lint) && !defined(SABER))
  17 static const char rcsid_lexer_c[] = "$Id: lexer.c 2144 2008-01-21 07:57:32Z kcr $";
  18 #endif
  19
  20 #include <zephyr/mit-copyright.h>
  21
  22 /****************************************************************************/
  23 /*                                                                          */
  24 /*               The lexer for the zwgc description language:               */
  25 /*                                                                          */
  26 /****************************************************************************/
  27
  28 #include "new_memory.h"
  29 #include "new_string.h"
  30 #include "int_dictionary.h"
  31 #include "lexer.h"
  32 #include "parser.h"
  33 #include "y.tab.h"
  34
  35 /*
  36  * yylineno - this holds the current line # we are on.  Updated automatically
  37  *            by input() and unput(); lex_open() resets it to 1.
  38  */
  39
  40 int yylineno;
  41
  42 /*
  43  * keyword_dict - this dictionary maps keyword names to their token numbers.
      *                It stays NULL until the first lex_open() call, which
      *                creates it and fills it from the keywords table.
  44  */
  45
  46 static int_dictionary keyword_dict = NULL;
  47
  48 /****************************************************************************/
  49 /*                                                                          */
  50 /*                               I/O functions:                             */
  51 /*                                                                          */
  52 /****************************************************************************/
  53
  54 /*
  55  * input_file - this holds the FILE pointer to the file currently being lexed.
      *              It is set by lex_open() and read by input().
  56  */
  57
  58 static FILE *input_file;
  59
  60 /*
  61  * pushback - if not -1, holds a character that was pushed back by unput but
  62  *            not yet read by input.  There is room for only one such
      *            character at a time; lex_open() resets this to -1.
  63  */
  64
  65 static int pushback = -1;
  66
  67 static char
  68 input(void)
  69 {
  70     int c;
  71
  72     if (pushback != -1) {
  73         c = pushback;
  74         pushback = -1;
  75         if (c=='\n')
  76           yylineno++;
  77         return(c);
  78     }
  79
  80     c = getc(input_file);
  81     if (c=='\n')
  82       yylineno++;
  83     if (c==EOF)
  84       c = 0;
  85
  86     return(c);
  87 }
  88
  89 static void
  90 unput(int c)
  91 {
  92 #ifdef DEBUG
  93     if (pushback != -1) {
  94         printf("Attempt to push back 2 characters at one time!\n");
  95         exit(1);
  96     }
  97 #endif
  98
  99     pushback = c;
 100     if (c == '\n')
 101       yylineno--;
 102 }
 103
 104 /****************************************************************************/
 105 /*                                                                          */
 106 /*                           Initialization routines:                       */
 107 /*                                                                          */
 108 /****************************************************************************/
 109
 110 struct keyword_info {
 111     string keyword;
 112     int keyword_number;
 113 };
 114
 115 /*
 116  * keywords - This table holds a copy of the mapping from keyword name to
 117  *            token number and is used to initialize keyword_dict:
 118  */
 119
 120 static struct keyword_info keywords[] =   {
 121                    { "and", '&' },
 122                    { "appendport", APPENDPORT },
 123                    { "buffer", BUFFER },
 124                    { "break", BREAK },
 125                    { "closeinput", CLOSEINPUT },
 126                    { "closeoutput", CLOSEOUTPUT },
 127                    { "closeport", CLOSEPORT },
 128                    { "case", CASE },
 129                    { "clearbuf", CLEARBUF },
 130                    { "default", DEFAULT },
 131                    { "do", DO },
 132                    { "downcase", DOWNCASE },
 133                    { "else", ELSE },
 134                    { "elseif", ELSEIF },
 135                    { "endcase", ENDCASE },
 136                    { "endif", ENDIF },
 137                    { "endwhile", ENDWHILE },
 138                    { "exec", EXEC },
 139                    { "execport", EXECPORT },
 140                    { "exit", EXIT },
 141                    { "fields", FIELDS },
 142                    { "get", GET },
 143                    { "getenv", GETENV },
 144                    { "if", IF },
 145                    { "inputport", INPUTPORT },
 146                    { "lany", LANY },
 147                    { "lbreak", LBREAK },
 148                    { "lspan", LSPAN },
 149                    { "match", MATCH },
 150                    { "noop", NOOP },
 151                    { "not", '!' },
 152                    { "or", '|' },
 153                    { "outputport", OUTPUTPORT },
 154                    { "print", PRINT },
 155                    { "protect", PROTECT },
 156                    { "put", PUT },
 157                    { "rany", RANY },
 158                    { "rbreak", RBREAK },
 159                    { "rspan", RSPAN },
 160                    { "set", SET },
 161                    { "show", SHOW },
 162                    { "stylestrip", STYLESTRIP },
 163                    { "substitute", SUBSTITUTE },
 164                    { "then", THEN },
 165                    { "upcase", UPCASE },
 166                    { "while", WHILE },
 167                    { "verbatim", VERBATIM },
 168                    { "zvar", ZVAR } };
 169
 170 /*
 171  * lex_open - this routine [re]initializes the lexer & prepares it to lex
 172  *            a file.  Resets current line # to 1.
 173  */
 174
 175 void
 176 lex_open(FILE *file)
 177 {
 178     /*
 179      * Initialize I/O:
 180      */
 181     input_file = file;
 182     yylineno = 1;
 183     pushback = -1;
 184
 185     /*
 186      * Initialize keyword_dict from keywords if needed:
 187      */
 188     if (!keyword_dict) {
 189         int i;
 190
 191         keyword_dict = int_dictionary_Create(101);
 192
 193         for (i=0; i<sizeof(keywords)/sizeof(struct keyword_info); i++)
 194           int_dictionary_Define(keyword_dict, keywords[i].keyword,
 195                                 0)->value = keywords[i].keyword_number;
 196     }
 197 }
 198
 199 /****************************************************************************/
 200 /*                                                                          */
 201 /*                            lex subroutines:                              */
 202 /*                                                                          */
 203 /****************************************************************************/
 204
 205 /*
 206  * eat_escape_code - this rountine eats an escape code & returns the character
 207  *                   it codes for or 0 if it codes for "".
 208  *                   (an escape code is what follows a '\\' in a quoted
 209  *                   string)  Current escape codes are:
 210  *
 211  *                       "n"          == '\n'
 212  *                       "t"          == '\t'
 213  *                       "b"          == '\b'
 214  *                       "\n"         == "" (i.e., returns 0)
 215  *                       <EOF>        == ""
 216  *                       [0-7]{1,3}   == the character represented by the code
 217  *                                       interpreted as an octal number.
 218  *                       [^ntb0-7\n]  == the same character.  I.e., "*" == '*'
 219  */
 220
 221 #define  is_octal_digit(c)           (((c)>='0') && ((c)<='7'))
 222
 223 static char
 224 eat_escape_code(void)
 225 {
 226     int c, coded_char;
 227
 228     c = input();
 229
 230     switch (c) {
 231       case 0:  /* i.e., EOF */
 232         unput(c);
 233         return(c);
 234       case '\n':
 235         return(0);
 236       case 'n':
 237         return('\n');
 238       case 't':
 239         return('\t');
 240       case 'b':
 241         return('\b');
 242       case '0':   case '1':   case '2':   case '3':
 243       case '4':   case '5':   case '6':   case '7':
 244         coded_char = c - '0';
 245         c = input();
 246         if (!is_octal_digit(c)) {
 247             unput(c);
 248             return(coded_char);
 249         }
 250         coded_char = coded_char*8 + c-'0';
 251         c = input();
 252         if (!is_octal_digit(c)) {
 253             unput(c);
 254             return(coded_char);
 255         }
 256         return(coded_char*8 + c-'0');
 257       default:
 258         return(c);
 259     }
 260 }
 261
 262 /*
 263  * eat_string - this routine eats characters allowing escape codes via '\\'
 264  *              until a '"' is eaten.  If no '"' is seen before a '\n' or
 265  *              the <EOF>, a parse_error is set & 0 is returned.  Otherwise,
 266  *              the string represented by what has been eaten is returned.
 267  *              I.e., 'hello \n there"' would cause "hello \n there" to be
 268  *              returned.  (thats not a <cr> in the first case, a <cr> in the
 269  *              second)  The returned string is on the heap & must be freed
 270  *              eventually.  This routine should be passed the line # that the
 271  *              string we are eating started on.
 272  */
 273
 274 static char *
 275 eat_string(int starting_line)
 276 {
 277     int c;
 278     char buffer[500];
 279     char *ptr = buffer;
 280
 281     for (;;) {
 282         /*
 283          * Get the next input character, handling EOF:
 284          */
 285         c = input();
 286         if (!c) {
 287             unput(c);
 288             report_parse_error("unterminated string found beginning",
 289                             starting_line);
 290             return(0);
 291         }
 292
 293         /*
 294          * Deal with special characters ('\\', '"', and '\n'):
 295          */
 296         if (c=='\\') {
 297             c = eat_escape_code();
 298             if (!c)
 299               continue;
 300         } else if (c == '"') {
 301             *ptr = 0;
 302             return(string_Copy(buffer));
 303         } else if (c == '\n') {
 304             unput(c);        /* fix line # reference to right line # */
 305             report_parse_error("carriage return found in string", yylineno);
 306             return(0);
 307         }
 308
 309         /*
 310          * Add the character c to the current string:
 311          */
 312         *ptr = c;
 313         ptr++;
 314
 315         /*
 316          * If out of buffer space, do a recursive call then
 317          * concatanate the result to the string read in so far to get the
 318          * entire string and return that:
 319          */
 320         if (ptr>buffer+sizeof(buffer)-20) {
 321             string rest_of_string, result;
 322
 323             rest_of_string = eat_string(starting_line);
 324             if (!rest_of_string)
 325               return(0);
 326
 327             *ptr = 0;
 328             result = string_Concat(buffer, rest_of_string);
 329             free(rest_of_string);
 330             return(result);
 331         }
 332     }
 333 }
 334
 335 /*
 336  * eat_show_line - internal routine for eat_show:
 337  *
 338  *        This routine reads in a physical line of text allowing escape
 339  *    codes via '\\'.  If the line ends with a newline, the newline is eaten.
 340  *    If the line ends with a EOF, the EOF is not eaten.  The string
 341  *    represented by what has been eaten is returned.  The returned string
 342  *    is on the heap & must be freed eventually.  If test_for_endshow is
 343  *    true and the line read in starts off with "endshow" exactly
 344  *    (i.e., no escape codes) followed by any non-identifier-char, then
 345  *    instead of doing the above, we just eat the "endshow" & return 0.
 346  */
 347
 348 static char *
 349 eat_show_line(int test_for_endshow)
 350 {
 351     int c;
 352     int saw_escape_code = 0;
 353     int starting_line = yylineno;
 354     char buffer[200];      /* This must be large enough to hold "endshow" */
 355     char *ptr = buffer;
 356
 357     while (yylineno == starting_line) {
 358         c = input();
 359         if (!c) {
 360             unput(c);
 361             *ptr = '\0';
 362             return(string_Copy(buffer));
 363         } else if (c == '\\') {
 364             saw_escape_code = 1;
 365             c = eat_escape_code();
 366             if (!c)
 367               continue;
 368         }
 369
 370         *ptr = c;
 371         ptr++;
 372
 373         if ((ptr==buffer+strlen("endshow")) && test_for_endshow)
 374           if (!strncmp(buffer, "endshow", strlen("endshow"))
 375               && !saw_escape_code) {
 376               c = input();
 377               unput(c);
 378               if (!is_identifier_char(c))
 379                 return(0);
 380           }
 381
 382         if (ptr>buffer+sizeof(buffer)-2) {
 383             string the_line;
 384             string rest_of_line = eat_show_line(0);
 385
 386             *ptr = '\0';
 387             the_line = string_Concat(buffer, rest_of_line);
 388             free(rest_of_line);
 389             return(the_line);
 390         }
 391     }
 392
 393     *ptr = '\0';
 394     return(string_Copy(buffer));
 395 }
 396
 397 /*
 398  * eat_til_endshow - this routine eats characters allowing escape codes via
 399  *                   '\\' up to a endshow\{nonalpha} found at the
 400  *                   start of a line not counting leading whitespace.
 401  *                   If <EOF> is seen before the terminator, a parse_error
 402  *                   is set & 0 returned.  Otherwise, the string represented
 403  *                   by what has been eaten (escape codes replaced by what
 404  *                   they stand for and leading spaces and tabs removed from
 405  *                   each physical line) is returned.  The returned string
 406  *                   is on the heap & must be freed eventually.  Note that
 407  *                   to embed endshow in a message, endsho\w can be used.
 408  *                   This routine should be passed the line # of the show
 409  *                   command it is being used to process for use in error
 410  *                   messages.
 411  */
 412
 413 static char *
 414 eat_til_endshow(int start_line_no)
 415 {
 416     register int c;
 417     string text_so_far = string_Copy("");
 418     string next_line;
 419
 420     for (;;) {
 421         /*
 422          * Skip the spaces & tabs at the start of the current line:
 423          */
 424         while ((c=input()), c==' ' || c=='\t') ;
 425         unput(c);
 426
 427         /*
 428          * Handle unterminated shows:
 429          */
 430         if (!c) {
 431             report_parse_error("unterminated show beginning", start_line_no);
 432             free(text_so_far);
 433             return(0);
 434         }
 435
 436         /*
 437          * Read in rest of the line (including the <cr> at end), allowing
 438          * for escape codes and checking for "endshow{nonalpha}" at the
 439          * start of the line.  (Note: \<newline> is considered the
 440          * end of a line here!)
 441          */
 442         next_line = eat_show_line(1);
 443
 444         if (!next_line)  /* i.e., is this the endshow line? */
 445           return(text_so_far);
 446
 447         text_so_far = string_Concat2(text_so_far, next_line);
 448         free(next_line);
 449     }
 450 }
 451
 452 /*
 453  * handle_show - this routine is called after "show"\{nonalpha} is
 454  *               found to handle up to the endshow.  The token # is
 455  *               returned.
 456  */
 457
 458 static int
 459 handle_show(void)
 460 {
 461     int c;
 462     int start_line_no = yylineno;
 463
 464     /*
 465      * Eat up ' ' and '\t's after show.  If the next character is a newline,
 466      * eat it.  This is so we don't get an extra newline when we call
 467      * eat_til_endshow:
 468      */
 469     while (c=input(), c==' ' || c=='\t') ;
 470     if (c!='\n')
 471       unput(c);
 472
 473     yylval.text = eat_til_endshow(start_line_no);
 474     if (yylval.text)
 475       return(SHOW);
 476     else
 477       return(ERROR);
 478 }
 479
 480 /****************************************************************************/
 481 /*                                                                          */
 482 /*                         The main lexer itself:                           */
 483 /*                                                                          */
 484 /****************************************************************************/
 485
 486 /*
 487  * yylex - performs as per. the yacc manual's requirements
 488  */
 489
 490 int yylex(void)
 491 {
 492     register int c, last_char;
 493     register char *ptr;
 494     int start_line_no;
 495     int_dictionary_binding *binding;
 496     char varname[MAX_IDENTIFIER_LENGTH+1];
 497
 498     for (;;) {
 499         switch (c = input()) {
 500
 501             /*
 502              * Skip whitespace:
 503              */
 504           case ' ':   case '\t':   case '\n':
 505             continue;
 506
 507             /*
 508              * '#' comments out everything up to the and including
 509              * the next <cr>:
 510              */
 511           case '#':
 512             while ( (c=input()) && (c!='\n') ) ;
 513             if (!c)
 514               unput(c);
 515             continue;
 516
 517             /*
 518              * Handle c-style comments.  Note that "/[^*]" is not the start
 519              * of any valid token.
 520              */
 521           case '/':
 522             start_line_no = yylineno;
 523
 524             /* verify that next character is a '*': */
 525             if ((c=input()) != '*')
 526               return(ERROR);
 527
 528             /* Scan until "*\/" or <EOF>: */
 529             for (last_char=0; ; last_char=c) {
 530                 c = input();
 531                 if (c == '/' && (last_char=='*'))
 532                   break;
 533                 if (!c) {
 534                     unput(c);
 535                     report_parse_error("unterminated c style comment found beginning", start_line_no);
 536                     return(ERROR);
 537                 }
 538             }
 539             continue;
 540
 541             /*
 542              * The following characters lex as themselves:
 543              *   '+', '|', '&', '(', ')', '.', ',' and <EOF>:
 544              */
 545           case   0:   case '+':   case '|':   case '&':   case '(':
 546           case ')':   case '.':   case ',':
 547             return(c);
 548
 549             /*
 550              * Handle "=[^~=]", "=~", and "==":
 551              */
 552           case '=':
 553             switch (c = input()) {
 554               case '~':
 555                 return(REGEQ);
 556               case '=':
 557                 return(EQ);
 558               default:
 559                 unput(c);
 560                 return('=');
 561             }
 562
 563             /*
 564              * Handle "![^~=]", "!~", and "!=":
 565              */
 566           case '!':
 567             switch (c = input()) {
 568               case '~':
 569                 return(REGNEQ);
 570               case '=':
 571                 return(NEQ);
 572               default:
 573                 unput(c);
 574                 return('!');
 575             }
 576
 577             /*
 578              * Handle identifiers and keywords:
 579              *
 580              * Note that the below set of characters is hard coded from
 581              * is_identifier_char from parser.h.
 582              */
 583           case 'a':   case 'b':   case 'c':   case 'd':   case 'e':
 584           case 'f':   case 'g':   case 'h':   case 'i':   case 'j':
 585           case 'k':   case 'l':   case 'm':   case 'n':   case 'o':
 586           case 'p':   case 'q':   case 'r':   case 's':   case 't':
 587           case 'u':   case 'v':   case 'w':   case 'x':   case 'y':
 588           case 'z':
 589           case 'A':   case 'B':   case 'C':   case 'D':   case 'E':
 590           case 'F':   case 'G':   case 'H':   case 'I':   case 'J':
 591           case 'K':   case 'L':   case 'M':   case 'N':   case 'O':
 592           case 'P':   case 'Q':   case 'R':   case 'S':   case 'T':
 593           case 'U':   case 'V':   case 'W':   case 'X':   case 'Y':
 594           case 'Z':
 595           case '0':   case '1':   case '2':   case '3':   case '4':
 596           case '5':   case '6':   case '7':   case '8':   case '9':
 597           case '_':
 598             /*
 599              * Read in the first MAX_IDENTIFIER_LENGTH characters of the
 600              * identifier into varname null terminated.  Eat
 601              * the rest of the characters of the identifier:
 602              */
 603             for (ptr = varname;;) {
 604                 if (ptr<varname+MAX_IDENTIFIER_LENGTH)
 605                   *(ptr++) = c;
 606                 c = input();
 607                 if (!is_identifier_char(c))
 608                   break;
 609             }
 610             unput(c);
 611             *ptr = '\0';
 612
 613             /*
 614              * Look up the identifier in the keyword dictionary.
 615              * If its a match, return the keyword's #.  In the case
 616              * of show, call handle_show to do more processing.
 617              * If not a match, treat as a variable name.
 618              */
 619             binding = int_dictionary_Lookup(keyword_dict, varname);
 620             if (!binding) {
 621                 yylval.text = string_Copy(varname);
 622                 return(VARNAME);
 623             }
 624             if (binding->value == SHOW)
 625               return(handle_show());
 626             else
 627               return(binding->value);
 628
 629             /*
 630              * Handle "${identifier}".  Note that $ followed by a
 631              * non-identifier character is not the start of any valid token.
 632              */
 633           case '$':
 634             c = input();
 635             if (!is_identifier_char(c))
 636               return(ERROR);
 637
 638             /*
 639              * Read in the first MAX_IDENTIFIER_LENGTH characters of the
 640              * identifier into varname null terminated.  Eat
 641              * the rest of the characters of the identifier:
 642              */
 643             for (ptr = varname;;) {
 644                 if (ptr<varname+MAX_IDENTIFIER_LENGTH)
 645                   *(ptr++) = c;
 646                 c = input();
 647                 if (!is_identifier_char(c))
 648                   break;
 649             }
 650             unput(c);
 651             *ptr = '\0';
 652
 653             yylval.text = string_Copy(varname);
 654             return(VARREF);
 655
 656             /*
 657              * Handle constant strings:
 658              */
 659           case '"':
 660             yylval.text = eat_string(yylineno);
 661             if (yylval.text)
 662               return(STRING);
 663             else
 664               return(ERROR);
 665
 666             /*
 667              * All other characters do not start valid tokens:
 668              */
 669           default:
 670             return(ERROR);
 671         }
 672     }
 673 }