zephyr/zwgc/lexer.c

   1 /* This file is part of the Project Athena Zephyr Notification System.
   2  * It is one of the source files comprising zwgc, the Zephyr WindowGram
   3  * client.
   4  *
   5  *      Created by:     Marc Horowitz <marc@athena.mit.edu>
   6  *
   7  *      $Id$
   8  *
   9  *      Copyright (c) 1989 by the Massachusetts Institute of Technology.
  10  *      For copying and distribution information, see the file
  11  *      "mit-copyright.h".
  12  */
  13
  14 #include <sysdep.h>
  15
  16 #if (!defined(lint) && !defined(SABER))
  17 static const char rcsid_lexer_c[] = "$Id$";
  18 #endif
  19
  20 #include <zephyr/mit-copyright.h>
  21
  22 /****************************************************************************/
  23 /*                                                                          */
  24 /*               The lexer for the zwgc description language:               */
  25 /*                                                                          */
  26 /****************************************************************************/
  27
  28 #include "new_memory.h"
  29 #include "new_string.h"
  30 #include "int_dictionary.h"
  31 #include "lexer.h"
  32 #include "parser.h"
  33 #include "y.tab.h"
  34
  35 /*
  36  * yylineno - this holds the current line # we are on.  Updated automatically
  37  *            by input() and unput().
  38  */
  39
  40 int yylineno;
  41
  42 /*
  43  * keyword_dict - this dictionary maps keyword names to their token numbers.
  44  */
  45
  46 static int_dictionary keyword_dict = NULL;
  47
  48 /****************************************************************************/
  49 /*                                                                          */
  50 /*                               I/O functions:                             */
  51 /*                                                                          */
  52 /****************************************************************************/
  53
  54 /*
  55  * input_file - this holds the FILE pointer to the file currently being lexed.
  56  */
  57
  58 static FILE *input_file;
  59
  60 /*
  61  * pushback - if not -1, holds a character that was pushed back by unput but
  62  *            not yet read by input.
  63  */
  64
  65 static int pushback = -1;
  66
  67 static char input()
  68 {
  69     int c;
  70
  71     if (pushback != -1) {
  72         c = pushback;
  73         pushback = -1;
  74         if (c=='\n')
  75           yylineno++;
  76         return(c);
  77     }
  78
  79     c = getc(input_file);
  80     if (c=='\n')
  81       yylineno++;
  82     if (c==EOF)
  83       c = 0;
  84
  85     return(c);
  86 }
  87
  88 static void unput(c)
  89      int c;
  90 {
  91 #ifdef DEBUG
  92     if (pushback != -1) {
  93         printf("Attempt to push back 2 characters at one time!\n");
  94         exit(1);
  95     }
  96 #endif
  97
  98     pushback = c;
  99     if (c == '\n')
 100       yylineno--;
 101 }
 102
 103 /****************************************************************************/
 104 /*                                                                          */
 105 /*                           Initialization routines:                       */
 106 /*                                                                          */
 107 /****************************************************************************/
 108
 109 struct keyword_info {
 110     string keyword;
 111     int keyword_number;
 112 };
 113
 114 /*
 115  * keywords - This table holds a copy of the mapping from keyword name to
 116  *            token number and is used to initialize keyword_dict:
 117  */
 118
 119 static struct keyword_info keywords[] =   {
 120                    { "and", '&' },
 121                    { "appendport", APPENDPORT },
 122                    { "buffer", BUFFER },
 123                    { "break", BREAK },
 124                    { "closeinput", CLOSEINPUT },
 125                    { "closeoutput", CLOSEOUTPUT },
 126                    { "closeport", CLOSEPORT },
 127                    { "case", CASE },
 128                    { "clearbuf", CLEARBUF },
 129                    { "default", DEFAULT },
 130                    { "do", DO },
 131                    { "downcase", DOWNCASE },
 132                    { "else", ELSE },
 133                    { "elseif", ELSEIF },
 134                    { "endcase", ENDCASE },
 135                    { "endif", ENDIF },
 136                    { "endwhile", ENDWHILE },
 137                    { "exec", EXEC },
 138                    { "execport", EXECPORT },
 139                    { "exit", EXIT },
 140                    { "fields", FIELDS },
 141                    { "get", GET },
 142                    { "getenv", GETENV },
 143                    { "if", IF },
 144                    { "inputport", INPUTPORT },
 145                    { "lany", LANY },
 146                    { "lbreak", LBREAK },
 147                    { "lspan", LSPAN },
 148                    { "match", MATCH },
 149                    { "noop", NOOP },
 150                    { "not", '!' },
 151                    { "or", '|' },
 152                    { "outputport", OUTPUTPORT },
 153                    { "print", PRINT },
 154                    { "protect", PROTECT },
 155                    { "put", PUT },
 156                    { "rany", RANY },
 157                    { "rbreak", RBREAK },
 158                    { "rspan", RSPAN },
 159                    { "set", SET },
 160                    { "show", SHOW },
 161                    { "stylestrip", STYLESTRIP },
 162                    { "substitute", SUBSTITUTE },
 163                    { "then", THEN },
 164                    { "upcase", UPCASE },
 165                    { "while", WHILE },
 166                    { "verbatim", VERBATIM },
 167                    { "zvar", ZVAR } };
 168
 169 /*
 170  * lex_open - this routine [re]initializes the lexer & prepares it to lex
 171  *            a file.  Resets current line # to 1.
 172  */
 173
 174 void lex_open(file)
 175      FILE *file;
 176 {
 177     /*
 178      * Initialize I/O:
 179      */
 180     input_file = file;
 181     yylineno = 1;
 182     pushback = -1;
 183
 184     /*
 185      * Initialize keyword_dict from keywords if needed:
 186      */
 187     if (!keyword_dict) {
 188         int i;
 189
 190         keyword_dict = int_dictionary_Create(101);
 191
 192         for (i=0; i<sizeof(keywords)/sizeof(struct keyword_info); i++)
 193           int_dictionary_Define(keyword_dict, keywords[i].keyword,
 194                                 0)->value = keywords[i].keyword_number;
 195     }
 196 }
 197
 198 /****************************************************************************/
 199 /*                                                                          */
 200 /*                            lex subroutines:                              */
 201 /*                                                                          */
 202 /****************************************************************************/
 203
 204 /*
 205  * eat_escape_code - this rountine eats an escape code & returns the character
 206  *                   it codes for or 0 if it codes for "".
 207  *                   (an escape code is what follows a '\\' in a quoted
 208  *                   string)  Current escape codes are:
 209  *
 210  *                       "n"          == '\n'
 211  *                       "t"          == '\t'
 212  *                       "b"          == '\b'
 213  *                       "\n"         == "" (i.e., returns 0)
 214  *                       <EOF>        == ""
 215  *                       [0-7]{1,3}   == the character represented by the code
 216  *                                       interpreted as an octal number.
 217  *                       [^ntb0-7\n]  == the same character.  I.e., "*" == '*'
 218  */
 219
 220 #define  is_octal_digit(c)           (((c)>='0') && ((c)<='7'))
 221
 222 static char eat_escape_code()
 223 {
 224     int c, coded_char;
 225
 226     c = input();
 227
 228     switch (c) {
 229       case 0:  /* i.e., EOF */
 230         unput(c);
 231         return(c);
 232       case '\n':
 233         return(0);
 234       case 'n':
 235         return('\n');
 236       case 't':
 237         return('\t');
 238       case 'b':
 239         return('\b');
 240       case '0':   case '1':   case '2':   case '3':
 241       case '4':   case '5':   case '6':   case '7':
 242         coded_char = c - '0';
 243         c = input();
 244         if (!is_octal_digit(c)) {
 245             unput(c);
 246             return(coded_char);
 247         }
 248         coded_char = coded_char*8 + c-'0';
 249         c = input();
 250         if (!is_octal_digit(c)) {
 251             unput(c);
 252             return(coded_char);
 253         }
 254         return(coded_char*8 + c-'0');
 255       default:
 256         return(c);
 257     }
 258 }
 259
 260 /*
 261  * eat_string - this routine eats characters allowing escape codes via '\\'
 262  *              until a '"' is eaten.  If no '"' is seen before a '\n' or
 263  *              the <EOF>, a parse_error is set & 0 is returned.  Otherwise,
 264  *              the string represented by what has been eaten is returned.
 265  *              I.e., 'hello \n there"' would cause "hello \n there" to be
 266  *              returned.  (thats not a <cr> in the first case, a <cr> in the
 267  *              second)  The returned string is on the heap & must be freed
 268  *              eventually.  This routine should be passed the line # that the
 269  *              string we are eating started on.
 270  */
 271
 272 static char *eat_string(starting_line)
 273      int starting_line;
 274 {
 275     int c;
 276     char buffer[500];
 277     char *ptr = buffer;
 278
 279     for (;;) {
 280         /*
 281          * Get the next input character, handling EOF:
 282          */
 283         c = input();
 284         if (!c) {
 285             unput(c);
 286             report_parse_error("unterminated string found beginning",
 287                             starting_line);
 288             return(0);
 289         }
 290
 291         /*
 292          * Deal with special characters ('\\', '"', and '\n'):
 293          */
 294         if (c=='\\') {
 295             c = eat_escape_code();
 296             if (!c)
 297               continue;
 298         } else if (c == '"') {
 299             *ptr = 0;
 300             return(string_Copy(buffer));
 301         } else if (c == '\n') {
 302             unput(c);        /* fix line # reference to right line # */
 303             report_parse_error("carriage return found in string", yylineno);
 304             return(0);
 305         }
 306
 307         /*
 308          * Add the character c to the current string:
 309          */
 310         *ptr = c;
 311         ptr++;
 312
 313         /*
 314          * If out of buffer space, do a recursive call then
 315          * concatanate the result to the string read in so far to get the
 316          * entire string and return that:
 317          */
 318         if (ptr>buffer+sizeof(buffer)-20) {
 319             string rest_of_string, result;
 320
 321             rest_of_string = eat_string(starting_line);
 322             if (!rest_of_string)
 323               return(0);
 324
 325             *ptr = 0;
 326             result = string_Concat(buffer, rest_of_string);
 327             free(rest_of_string);
 328             return(result);
 329         }
 330     }
 331 }
 332
 333 /*
 334  * eat_show_line - internal routine for eat_show:
 335  *
 336  *        This routine reads in a physical line of text allowing escape
 337  *    codes via '\\'.  If the line ends with a newline, the newline is eaten.
 338  *    If the line ends with a EOF, the EOF is not eaten.  The string
 339  *    represented by what has been eaten is returned.  The returned string
 340  *    is on the heap & must be freed eventually.  If test_for_endshow is
 341  *    true and the line read in starts off with "endshow" exactly
 342  *    (i.e., no escape codes) followed by any non-identifier-char, then
 343  *    instead of doing the above, we just eat the "endshow" & return 0.
 344  */
 345
 346 static char *eat_show_line(test_for_endshow)
 347      int test_for_endshow;
 348 {
 349     int c;
 350     int saw_escape_code = 0;
 351     int starting_line = yylineno;
 352     char buffer[200];      /* This must be large enough to hold "endshow" */
 353     char *ptr = buffer;
 354
 355     while (yylineno == starting_line) {
 356         c = input();
 357         if (!c) {
 358             unput(c);
 359             *ptr = '\0';
 360             return(string_Copy(buffer));
 361         } else if (c == '\\') {
 362             saw_escape_code = 1;
 363             c = eat_escape_code();
 364             if (!c)
 365               continue;
 366         }
 367
 368         *ptr = c;
 369         ptr++;
 370
 371         if ((ptr==buffer+strlen("endshow")) && test_for_endshow)
 372           if (!strncmp(buffer, "endshow", strlen("endshow"))
 373               && !saw_escape_code) {
 374               c = input();
 375               unput(c);
 376               if (!is_identifier_char(c))
 377                 return(0);
 378           }
 379
 380         if (ptr>buffer+sizeof(buffer)-2) {
 381             string the_line;
 382             string rest_of_line = eat_show_line(0);
 383
 384             *ptr = '\0';
 385             the_line = string_Concat(buffer, rest_of_line);
 386             free(rest_of_line);
 387             return(the_line);
 388         }
 389     }
 390
 391     *ptr = '\0';
 392     return(string_Copy(buffer));
 393 }
 394
 395 /*
 396  * eat_til_endshow - this routine eats characters allowing escape codes via
 397  *                   '\\' up to a endshow\{nonalpha} found at the
 398  *                   start of a line not counting leading whitespace.
 399  *                   If <EOF> is seen before the terminator, a parse_error
 400  *                   is set & 0 returned.  Otherwise, the string represented
 401  *                   by what has been eaten (escape codes replaced by what
 402  *                   they stand for and leading spaces and tabs removed from
 403  *                   each physical line) is returned.  The returned string
 404  *                   is on the heap & must be freed eventually.  Note that
 405  *                   to embed endshow in a message, endsho\w can be used.
 406  *                   This routine should be passed the line # of the show
 407  *                   command it is being used to process for use in error
 408  *                   messages.
 409  */
 410
 411 static char *eat_til_endshow(start_line_no)
 412      int start_line_no;
 413 {
 414     register int c;
 415     string text_so_far = string_Copy("");
 416     string next_line;
 417
 418     for (;;) {
 419         /*
 420          * Skip the spaces & tabs at the start of the current line:
 421          */
 422         while ((c=input()), c==' ' || c=='\t') ;
 423         unput(c);
 424
 425         /*
 426          * Handle unterminated shows:
 427          */
 428         if (!c) {
 429             report_parse_error("unterminated show beginning", start_line_no);
 430             free(text_so_far);
 431             return(0);
 432         }
 433
 434         /*
 435          * Read in rest of the line (including the <cr> at end), allowing
 436          * for escape codes and checking for "endshow{nonalpha}" at the
 437          * start of the line.  (Note: \<newline> is considered the
 438          * end of a line here!)
 439          */
 440         next_line = eat_show_line(1);
 441
 442         if (!next_line)  /* i.e., is this the endshow line? */
 443           return(text_so_far);
 444
 445         text_so_far = string_Concat2(text_so_far, next_line);
 446         free(next_line);
 447     }
 448 }
 449
 450 /*
 451  * handle_show - this routine is called after "show"\{nonalpha} is
 452  *               found to handle up to the endshow.  The token # is
 453  *               returned.
 454  */
 455
 456 static int handle_show()
 457 {
 458     int c;
 459     int start_line_no = yylineno;
 460
 461     /*
 462      * Eat up ' ' and '\t's after show.  If the next character is a newline,
 463      * eat it.  This is so we don't get an extra newline when we call
 464      * eat_til_endshow:
 465      */
 466     while (c=input(), c==' ' || c=='\t') ;
 467     if (c!='\n')
 468       unput(c);
 469
 470     if (yylval.text = eat_til_endshow(start_line_no))
 471       return(SHOW);
 472     else
 473       return(ERROR);
 474 }
 475
 476 /****************************************************************************/
 477 /*                                                                          */
 478 /*                         The main lexer itself:                           */
 479 /*                                                                          */
 480 /****************************************************************************/
 481
 482 /*
 483  * yylex - performs as per. the yacc manual's requirements
 484  */
 485
 486 int yylex()
 487 {
 488     register int c, last_char;
 489     register char *ptr;
 490     int start_line_no;
 491     int_dictionary_binding *binding;
 492     char varname[MAX_IDENTIFIER_LENGTH+1];
 493
 494     for (;;) {
 495         switch (c = input()) {
 496
 497             /*
 498              * Skip whitespace:
 499              */
 500           case ' ':   case '\t':   case '\n':
 501             continue;
 502
 503             /*
 504              * '#' comments out everything up to the and including
 505              * the next <cr>:
 506              */
 507           case '#':
 508             while ( (c=input()) && (c!='\n') ) ;
 509             if (!c)
 510               unput(c);
 511             continue;
 512
 513             /*
 514              * Handle c-style comments.  Note that "/[^*]" is not the start
 515              * of any valid token.
 516              */
 517           case '/':
 518             start_line_no = yylineno;
 519
 520             /* verify that next character is a '*': */
 521             if ((c=input()) != '*')
 522               return(ERROR);
 523
 524             /* Scan until "*\/" or <EOF>: */
 525             for (last_char=0; ; last_char=c) {
 526                 c = input();
 527                 if (c == '/' && (last_char=='*'))
 528                   break;
 529                 if (!c) {
 530                     unput(c);
 531                     report_parse_error("unterminated c style comment found beginning", start_line_no);
 532                     return(ERROR);
 533                 }
 534             }
 535             continue;
 536
 537             /*
 538              * The following characters lex as themselves:
 539              *   '+', '|', '&', '(', ')', '.', ',' and <EOF>:
 540              */
 541           case   0:   case '+':   case '|':   case '&':   case '(':
 542           case ')':   case '.':   case ',':
 543             return(c);
 544
 545             /*
 546              * Handle "=[^~=]", "=~", and "==":
 547              */
 548           case '=':
 549             switch (c = input()) {
 550               case '~':
 551                 return(REGEQ);
 552               case '=':
 553                 return(EQ);
 554               default:
 555                 unput(c);
 556                 return('=');
 557             }
 558
 559             /*
 560              * Handle "![^~=]", "!~", and "!=":
 561              */
 562           case '!':
 563             switch (c = input()) {
 564               case '~':
 565                 return(REGNEQ);
 566               case '=':
 567                 return(NEQ);
 568               default:
 569                 unput(c);
 570                 return('!');
 571             }
 572
 573             /*
 574              * Handle identifiers and keywords:
 575              *
 576              * Note that the below set of characters is hard coded from
 577              * is_identifier_char from parser.h.
 578              */
 579           case 'a':   case 'b':   case 'c':   case 'd':   case 'e':
 580           case 'f':   case 'g':   case 'h':   case 'i':   case 'j':
 581           case 'k':   case 'l':   case 'm':   case 'n':   case 'o':
 582           case 'p':   case 'q':   case 'r':   case 's':   case 't':
 583           case 'u':   case 'v':   case 'w':   case 'x':   case 'y':
 584           case 'z':
 585           case 'A':   case 'B':   case 'C':   case 'D':   case 'E':
 586           case 'F':   case 'G':   case 'H':   case 'I':   case 'J':
 587           case 'K':   case 'L':   case 'M':   case 'N':   case 'O':
 588           case 'P':   case 'Q':   case 'R':   case 'S':   case 'T':
 589           case 'U':   case 'V':   case 'W':   case 'X':   case 'Y':
 590           case 'Z':
 591           case '0':   case '1':   case '2':   case '3':   case '4':
 592           case '5':   case '6':   case '7':   case '8':   case '9':
 593           case '_':
 594             /*
 595              * Read in the first MAX_IDENTIFIER_LENGTH characters of the
 596              * identifier into varname null terminated.  Eat
 597              * the rest of the characters of the identifier:
 598              */
 599             for (ptr = varname;;) {
 600                 if (ptr<varname+MAX_IDENTIFIER_LENGTH)
 601                   *(ptr++) = c;
 602                 c = input();
 603                 if (!is_identifier_char(c))
 604                   break;
 605             }
 606             unput(c);
 607             *ptr = '\0';
 608
 609             /*
 610              * Look up the identifier in the keyword dictionary.
 611              * If its a match, return the keyword's #.  In the case
 612              * of show, call handle_show to do more processing.
 613              * If not a match, treat as a variable name.
 614              */
 615             binding = int_dictionary_Lookup(keyword_dict, varname);
 616             if (!binding) {
 617                 yylval.text = string_Copy(varname);
 618                 return(VARNAME);
 619             }
 620             if (binding->value == SHOW)
 621               return(handle_show());
 622             else
 623               return(binding->value);
 624
 625             /*
 626              * Handle "${identifier}".  Note that $ followed by a
 627              * non-identifier character is not the start of any valid token.
 628              */
 629           case '$':
 630             c = input();
 631             if (!is_identifier_char(c))
 632               return(ERROR);
 633
 634             /*
 635              * Read in the first MAX_IDENTIFIER_LENGTH characters of the
 636              * identifier into varname null terminated.  Eat
 637              * the rest of the characters of the identifier:
 638              */
 639             for (ptr = varname;;) {
 640                 if (ptr<varname+MAX_IDENTIFIER_LENGTH)
 641                   *(ptr++) = c;
 642                 c = input();
 643                 if (!is_identifier_char(c))
 644                   break;
 645             }
 646             unput(c);
 647             *ptr = '\0';
 648
 649             yylval.text = string_Copy(varname);
 650             return(VARREF);
 651
 652             /*
 653              * Handle constant strings:
 654              */
 655           case '"':
 656             if (yylval.text = eat_string(yylineno))
 657               return(STRING);
 658             else
 659               return(ERROR);
 660
 661             /*
 662              * All other characters do not start valid tokens:
 663              */
 664           default:
 665             return(ERROR);
 666         }
 667     }
 668 }