1 /* This file is part of the Project Athena Zephyr Notification System.
2 * It is one of the source files comprising zwgc, the Zephyr WindowGram
5 * Created by: Marc Horowitz <marc@athena.mit.edu>
7 * $Id: lexer.c 2144 2008-01-21 07:57:32Z kcr $
9 * Copyright (c) 1989 by the Massachusetts Institute of Technology.
10 * For copying and distribution information, see the file
16 #if (!defined(lint) && !defined(SABER))
17 static const char rcsid_lexer_c[] = "$Id: lexer.c 2144 2008-01-21 07:57:32Z kcr $";
20 #include <zephyr/mit-copyright.h>
22 /****************************************************************************/
24 /* The lexer for the zwgc description language: */
26 /****************************************************************************/
28 #include "new_memory.h"
29 #include "new_string.h"
30 #include "int_dictionary.h"
36 * yylineno - this holds the current line # we are on. Updated automatically
37 * by input() and unput().
43 * keyword_dict - this dictionary maps keyword names to their token numbers.
46 static int_dictionary keyword_dict = NULL; /* stays NULL until it is created and filled from keywords[] by the "Initialize keyword_dict from keywords if needed" step */
48 /****************************************************************************/
52 /****************************************************************************/
55 * input_file - this holds the FILE pointer to the file currently being lexed.
58 static FILE *input_file;
61 * pushback - if not -1, holds a character that was pushed back by unput but
62 * not yet read by input.
65 static int pushback = -1; /* one-slot buffer, -1 == empty; a second pushback before a read hits the "Attempt to push back 2 characters" diagnostic */
94 printf("Attempt to push back 2 characters at one time!\n");
104 /****************************************************************************/
106 /* Initialization routines: */
108 /****************************************************************************/
110 struct keyword_info {
116 * keywords - This table holds a copy of the mapping from keyword name to
117 * token number and is used to initialize keyword_dict:
120 static struct keyword_info keywords[] = {
122 { "appendport", APPENDPORT },
123 { "buffer", BUFFER },
125 { "closeinput", CLOSEINPUT },
126 { "closeoutput", CLOSEOUTPUT },
127 { "closeport", CLOSEPORT },
129 { "clearbuf", CLEARBUF },
130 { "default", DEFAULT },
132 { "downcase", DOWNCASE },
134 { "elseif", ELSEIF },
135 { "endcase", ENDCASE },
137 { "endwhile", ENDWHILE },
139 { "execport", EXECPORT },
141 { "fields", FIELDS },
143 { "getenv", GETENV },
145 { "inputport", INPUTPORT },
147 { "lbreak", LBREAK },
153 { "outputport", OUTPUTPORT },
155 { "protect", PROTECT },
158 { "rbreak", RBREAK },
162 { "stylestrip", STYLESTRIP },
163 { "substitute", SUBSTITUTE },
165 { "upcase", UPCASE },
167 { "verbatim", VERBATIM },
171 * lex_open - this routine [re]initializes the lexer & prepares it to lex
172 * a file. Resets current line # to 1.
186 * Initialize keyword_dict from keywords if needed:
191 keyword_dict = int_dictionary_Create(101);
193 for (i=0; i<sizeof(keywords)/sizeof(struct keyword_info); i++)
194 int_dictionary_Define(keyword_dict, keywords[i].keyword,
195 0)->value = keywords[i].keyword_number;
199 /****************************************************************************/
201 /* lex subroutines: */
203 /****************************************************************************/
206 * eat_escape_code - this routine eats an escape code & returns the character
207 * it codes for or 0 if it codes for "".
208 * (an escape code is what follows a '\\' in a quoted
209 * string) Current escape codes are:
214 * "\n" == "" (i.e., returns 0)
216 * [0-7]{1,3} == the character represented by the code
217 * interpreted as an octal number.
218 * [^ntb0-7\n] == the same character. I.e., "*" == '*'
/*
 * is_octal_digit - yields 1 if c is one of the characters '0'..'7', else 0.
 *
 * The argument is evaluated exactly once, so an expression with side
 * effects (e.g. is_octal_digit(*p++)) is safe to pass.  Both range checks
 * are folded into one comparison: the subtraction is done in unsigned
 * arithmetic, so any value below '0' wraps around to a large number and
 * fails the "<= 7" test, and no signed overflow can occur.
 */
#define is_octal_digit(c) (((unsigned int)(c) - (unsigned int)'0') <= 7u)
224 eat_escape_code(void)
231 case 0: /* i.e., EOF */
242 case '0': case '1': case '2': case '3':
243 case '4': case '5': case '6': case '7':
244 coded_char = c - '0';
246 if (!is_octal_digit(c)) {
250 coded_char = coded_char*8 + c-'0';
252 if (!is_octal_digit(c)) {
256 return(coded_char*8 + c-'0');
263 * eat_string - this routine eats characters allowing escape codes via '\\'
264 * until a '"' is eaten. If no '"' is seen before a '\n' or
265 * the <EOF>, a parse_error is set & 0 is returned. Otherwise,
266 * the string represented by what has been eaten is returned.
267 * I.e., 'hello \n there"' would cause "hello \n there" to be
268 * returned. (that's not a <cr> in the first case, a <cr> in the
269 * second) The returned string is on the heap & must be freed
270 * eventually. This routine should be passed the line # that the
271 * string we are eating started on.
275 eat_string(int starting_line)
283 * Get the next input character, handling EOF:
288 report_parse_error("unterminated string found beginning",
294 * Deal with special characters ('\\', '"', and '\n'):
297 c = eat_escape_code();
300 } else if (c == '"') {
302 return(string_Copy(buffer));
303 } else if (c == '\n') {
304 unput(c); /* fix line # reference to right line # */
305 report_parse_error("carriage return found in string", yylineno);
310 * Add the character c to the current string:
316 * If out of buffer space, do a recursive call then
317 * concatenate the result to the string read in so far to get the
318 * entire string and return that:
320 if (ptr>buffer+sizeof(buffer)-20) {
321 string rest_of_string, result;
323 rest_of_string = eat_string(starting_line);
328 result = string_Concat(buffer, rest_of_string);
329 free(rest_of_string);
336 * eat_show_line - internal routine for eat_show:
338 * This routine reads in a physical line of text allowing escape
339 * codes via '\\'. If the line ends with a newline, the newline is eaten.
340 * If the line ends with an EOF, the EOF is not eaten. The string
341 * represented by what has been eaten is returned. The returned string
342 * is on the heap & must be freed eventually. If test_for_endshow is
343 * true and the line read in starts off with "endshow" exactly
344 * (i.e., no escape codes) followed by any non-identifier-char, then
345 * instead of doing the above, we just eat the "endshow" & return 0.
349 eat_show_line(int test_for_endshow)
352 int saw_escape_code = 0;
353 int starting_line = yylineno;
354 char buffer[200]; /* This must be large enough to hold "endshow" */
357 while (yylineno == starting_line) {
362 return(string_Copy(buffer));
363 } else if (c == '\\') {
365 c = eat_escape_code();
373 if ((ptr==buffer+strlen("endshow")) && test_for_endshow)
374 if (!strncmp(buffer, "endshow", strlen("endshow"))
375 && !saw_escape_code) {
378 if (!is_identifier_char(c))
382 if (ptr>buffer+sizeof(buffer)-2) {
384 string rest_of_line = eat_show_line(0);
387 the_line = string_Concat(buffer, rest_of_line);
394 return(string_Copy(buffer));
398 * eat_til_endshow - this routine eats characters allowing escape codes via
399 * '\\' up to an endshow\{nonalpha} found at the
400 * start of a line not counting leading whitespace.
401 * If <EOF> is seen before the terminator, a parse_error
402 * is set & 0 returned. Otherwise, the string represented
403 * by what has been eaten (escape codes replaced by what
404 * they stand for and leading spaces and tabs removed from
405 * each physical line) is returned. The returned string
406 * is on the heap & must be freed eventually. Note that
407 * to embed endshow in a message, endsho\w can be used.
408 * This routine should be passed the line # of the show
409 * command it is being used to process for use in error
414 eat_til_endshow(int start_line_no)
417 string text_so_far = string_Copy("");
422 * Skip the spaces & tabs at the start of the current line:
424 while ((c=input()), c==' ' || c=='\t') ;
428 * Handle unterminated shows:
431 report_parse_error("unterminated show beginning", start_line_no);
437 * Read in rest of the line (including the <cr> at end), allowing
438 * for escape codes and checking for "endshow{nonalpha}" at the
439 * start of the line. (Note: \<newline> is considered the
440 * end of a line here!)
442 next_line = eat_show_line(1);
444 if (!next_line) /* i.e., is this the endshow line? */
447 text_so_far = string_Concat2(text_so_far, next_line);
453 * handle_show - this routine is called after "show"\{nonalpha} is
454 * found to handle up to the endshow. The token # is
462 int start_line_no = yylineno;
465 * Eat up ' ' and '\t's after show. If the next character is a newline,
466 * eat it. This is so we don't get an extra newline when we call
469 while (c=input(), c==' ' || c=='\t') ;
473 yylval.text = eat_til_endshow(start_line_no);
480 /****************************************************************************/
482 /* The main lexer itself: */
484 /****************************************************************************/
487 * yylex - performs as per the yacc manual's requirements
492 register int c, last_char;
495 int_dictionary_binding *binding;
496 char varname[MAX_IDENTIFIER_LENGTH+1];
499 switch (c = input()) {
504 case ' ': case '\t': case '\n':
508 * '#' comments out everything up to and including
512 while ( (c=input()) && (c!='\n') ) ;
518 * Handle c-style comments. Note that "/[^*]" is not the start
519 * of any valid token.
522 start_line_no = yylineno;
524 /* verify that next character is a '*': */
525 if ((c=input()) != '*')
528 /* Scan until "*\/" or <EOF>: */
529 for (last_char=0; ; last_char=c) {
531 if (c == '/' && (last_char=='*'))
535 report_parse_error("unterminated c style comment found beginning", start_line_no);
542 * The following characters lex as themselves:
543 * '+', '|', '&', '(', ')', '.', ',' and <EOF>:
545 case 0: case '+': case '|': case '&': case '(':
546 case ')': case '.': case ',':
550 * Handle "=[^~=]", "=~", and "==":
553 switch (c = input()) {
564 * Handle "![^~=]", "!~", and "!=":
567 switch (c = input()) {
578 * Handle identifiers and keywords:
580 * Note that the below set of characters is hard coded from
581 * is_identifier_char from parser.h.
583 case 'a': case 'b': case 'c': case 'd': case 'e':
584 case 'f': case 'g': case 'h': case 'i': case 'j':
585 case 'k': case 'l': case 'm': case 'n': case 'o':
586 case 'p': case 'q': case 'r': case 's': case 't':
587 case 'u': case 'v': case 'w': case 'x': case 'y':
589 case 'A': case 'B': case 'C': case 'D': case 'E':
590 case 'F': case 'G': case 'H': case 'I': case 'J':
591 case 'K': case 'L': case 'M': case 'N': case 'O':
592 case 'P': case 'Q': case 'R': case 'S': case 'T':
593 case 'U': case 'V': case 'W': case 'X': case 'Y':
595 case '0': case '1': case '2': case '3': case '4':
596 case '5': case '6': case '7': case '8': case '9':
599 * Read in the first MAX_IDENTIFIER_LENGTH characters of the
600 * identifier into varname null terminated. Eat
601 * the rest of the characters of the identifier:
603 for (ptr = varname;;) {
604 if (ptr<varname+MAX_IDENTIFIER_LENGTH)
607 if (!is_identifier_char(c))
614 * Look up the identifier in the keyword dictionary.
615 * If it's a match, return the keyword's #. In the case
616 * of show, call handle_show to do more processing.
617 * If not a match, treat as a variable name.
619 binding = int_dictionary_Lookup(keyword_dict, varname);
621 yylval.text = string_Copy(varname);
624 if (binding->value == SHOW)
625 return(handle_show());
627 return(binding->value);
630 * Handle "${identifier}". Note that $ followed by a
631 * non-identifier character is not the start of any valid token.
635 if (!is_identifier_char(c))
639 * Read in the first MAX_IDENTIFIER_LENGTH characters of the
640 * identifier into varname null terminated. Eat
641 * the rest of the characters of the identifier:
643 for (ptr = varname;;) {
644 if (ptr<varname+MAX_IDENTIFIER_LENGTH)
647 if (!is_identifier_char(c))
653 yylval.text = string_Copy(varname);
657 * Handle constant strings:
660 yylval.text = eat_string(yylineno);
667 * All other characters do not start valid tokens: