1 /* This file is part of the Project Athena Zephyr Notification System.
2 * It is one of the source files comprising zwgc, the Zephyr WindowGram
5 * Created by: Marc Horowitz <marc@athena.mit.edu>
9 * Copyright (c) 1989 by the Massachusetts Institute of Technology.
10 * For copying and distribution information, see the file
16 #if (!defined(lint) && !defined(SABER))
17 static const char rcsid_lexer_c[] = "$Id$";
20 #include <zephyr/mit-copyright.h>
22 /****************************************************************************/
24 /* The lexer for the zwgc description language: */
26 /****************************************************************************/
28 #include "new_memory.h"
29 #include "new_string.h"
30 #include "int_dictionary.h"
36 * yylineno - this holds the current line # we are on. Updated automatically
37 * by input() and unput().
43 * keyword_dict - this dictionary maps keyword names to their token numbers.
46 static int_dictionary keyword_dict = NULL;
48 /****************************************************************************/
52 /****************************************************************************/
55 * input_file - this holds the FILE pointer to the file currently being lexed.
58 static FILE *input_file;
61 * pushback - if not -1, holds a character that was pushed back by unput but
62 * not yet read by input.
65 static int pushback = -1;
93 printf("Attempt to push back 2 characters at one time!\n");
103 /****************************************************************************/
105 /* Initialization routines: */
107 /****************************************************************************/
109 struct keyword_info {
115 * keywords - This table holds a copy of the mapping from keyword name to
116 * token number and is used to initialize keyword_dict:
119 static struct keyword_info keywords[] = {
121 { "appendport", APPENDPORT },
122 { "buffer", BUFFER },
124 { "closeinput", CLOSEINPUT },
125 { "closeoutput", CLOSEOUTPUT },
126 { "closeport", CLOSEPORT },
128 { "clearbuf", CLEARBUF },
129 { "default", DEFAULT },
131 { "downcase", DOWNCASE },
133 { "elseif", ELSEIF },
134 { "endcase", ENDCASE },
136 { "endwhile", ENDWHILE },
138 { "execport", EXECPORT },
140 { "fields", FIELDS },
142 { "getenv", GETENV },
144 { "inputport", INPUTPORT },
146 { "lbreak", LBREAK },
152 { "outputport", OUTPUTPORT },
154 { "protect", PROTECT },
157 { "rbreak", RBREAK },
161 { "stylestrip", STYLESTRIP },
162 { "substitute", SUBSTITUTE },
164 { "upcase", UPCASE },
166 { "verbatim", VERBATIM },
170 * lex_open - this routine [re]initializes the lexer & prepares it to lex
171 * a file. Resets current line # to 1.
185 * Initialize keyword_dict from keywords if needed:
190 keyword_dict = int_dictionary_Create(101);
192 for (i=0; i<sizeof(keywords)/sizeof(struct keyword_info); i++)
193 int_dictionary_Define(keyword_dict, keywords[i].keyword,
194 0)->value = keywords[i].keyword_number;
198 /****************************************************************************/
200 /* lex subroutines: */
202 /****************************************************************************/
205 * eat_escape_code - this routine eats an escape code & returns the character
206 * it codes for or 0 if it codes for "".
207 * (an escape code is what follows a '\\' in a quoted
208 * string) Current escape codes are:
213 * "\n" == "" (i.e., returns 0)
215 * [0-7]{1,3} == the character represented by the code
216 * interpreted as an octal number.
217 * [^ntb0-7\n] == the same character. I.e., "*" == '*'
220 #define is_octal_digit(c) (((c)>='0') && ((c)<='7')) /* nonzero iff c is one of '0'..'7'; note: c is evaluated twice */
222 static char eat_escape_code()
229 case 0: /* i.e., EOF */
240 case '0': case '1': case '2': case '3':
241 case '4': case '5': case '6': case '7':
242 coded_char = c - '0';
244 if (!is_octal_digit(c)) {
248 coded_char = coded_char*8 + c-'0';
250 if (!is_octal_digit(c)) {
254 return(coded_char*8 + c-'0');
261 * eat_string - this routine eats characters allowing escape codes via '\\'
262 * until a '"' is eaten. If no '"' is seen before a '\n' or
263 * the <EOF>, a parse_error is set & 0 is returned. Otherwise,
264 * the string represented by what has been eaten is returned.
265 * I.e., 'hello \n there"' would cause "hello \n there" to be
266 * returned. (that's not a <cr> in the first case, a <cr> in the
267 * second) The returned string is on the heap & must be freed
268 * eventually. This routine should be passed the line # that the
269 * string we are eating started on.
272 static char *eat_string(starting_line)
281 * Get the next input character, handling EOF:
286 report_parse_error("unterminated string found beginning",
292 * Deal with special characters ('\\', '"', and '\n'):
295 c = eat_escape_code();
298 } else if (c == '"') {
300 return(string_Copy(buffer));
301 } else if (c == '\n') {
302 unput(c); /* fix line # reference to right line # */
303 report_parse_error("carriage return found in string", yylineno);
308 * Add the character c to the current string:
314 * If out of buffer space, do a recursive call then
315 * concatenate the result to the string read in so far to get the
316 * entire string and return that:
318 if (ptr>buffer+sizeof(buffer)-20) {
319 string rest_of_string, result;
321 rest_of_string = eat_string(starting_line);
326 result = string_Concat(buffer, rest_of_string);
327 free(rest_of_string);
334 * eat_show_line - internal routine for eat_show:
336 * This routine reads in a physical line of text allowing escape
337 * codes via '\\'. If the line ends with a newline, the newline is eaten.
338 * If the line ends with an EOF, the EOF is not eaten. The string
339 * represented by what has been eaten is returned. The returned string
340 * is on the heap & must be freed eventually. If test_for_endshow is
341 * true and the line read in starts off with "endshow" exactly
342 * (i.e., no escape codes) followed by any non-identifier-char, then
343 * instead of doing the above, we just eat the "endshow" & return 0.
346 static char *eat_show_line(test_for_endshow)
347 int test_for_endshow;
350 int saw_escape_code = 0;
351 int starting_line = yylineno;
352 char buffer[200]; /* This must be large enough to hold "endshow" */
355 while (yylineno == starting_line) {
360 return(string_Copy(buffer));
361 } else if (c == '\\') {
363 c = eat_escape_code();
371 if ((ptr==buffer+strlen("endshow")) && test_for_endshow)
372 if (!strncmp(buffer, "endshow", strlen("endshow"))
373 && !saw_escape_code) {
376 if (!is_identifier_char(c))
380 if (ptr>buffer+sizeof(buffer)-2) {
382 string rest_of_line = eat_show_line(0);
385 the_line = string_Concat(buffer, rest_of_line);
392 return(string_Copy(buffer));
396 * eat_til_endshow - this routine eats characters allowing escape codes via
397 * '\\' up to a endshow\{nonalpha} found at the
398 * start of a line not counting leading whitespace.
399 * If <EOF> is seen before the terminator, a parse_error
400 * is set & 0 returned. Otherwise, the string represented
401 * by what has been eaten (escape codes replaced by what
402 * they stand for and leading spaces and tabs removed from
403 * each physical line) is returned. The returned string
404 * is on the heap & must be freed eventually. Note that
405 * to embed endshow in a message, endsho\w can be used.
406 * This routine should be passed the line # of the show
407 * command it is being used to process for use in error
411 static char *eat_til_endshow(start_line_no)
415 string text_so_far = string_Copy("");
420 * Skip the spaces & tabs at the start of the current line:
422 while ((c=input()), c==' ' || c=='\t') ;
426 * Handle unterminated shows:
429 report_parse_error("unterminated show beginning", start_line_no);
435 * Read in rest of the line (including the <cr> at end), allowing
436 * for escape codes and checking for "endshow{nonalpha}" at the
437 * start of the line. (Note: \<newline> is considered the
438 * end of a line here!)
440 next_line = eat_show_line(1);
442 if (!next_line) /* i.e., is this the endshow line? */
445 text_so_far = string_Concat2(text_so_far, next_line);
451 * handle_show - this routine is called after "show"\{nonalpha} is
452 * found to handle up to the endshow. The token # is
456 static int handle_show()
459 int start_line_no = yylineno;
462 * Eat up ' ' and '\t's after show. If the next character is a newline,
463 * eat it. This is so we don't get an extra newline when we call
466 while (c=input(), c==' ' || c=='\t') ;
470 if (yylval.text = eat_til_endshow(start_line_no))
476 /****************************************************************************/
478 /* The main lexer itself: */
480 /****************************************************************************/
483 * yylex - performs as per. the yacc manual's requirements
488 register int c, last_char;
491 int_dictionary_binding *binding;
492 char varname[MAX_IDENTIFIER_LENGTH+1];
495 switch (c = input()) {
500 case ' ': case '\t': case '\n':
504 * '#' comments out everything up to and including
508 while ( (c=input()) && (c!='\n') ) ;
514 * Handle c-style comments. Note that "/[^*]" is not the start
515 * of any valid token.
518 start_line_no = yylineno;
520 /* verify that next character is a '*': */
521 if ((c=input()) != '*')
524 /* Scan until "*\/" or <EOF>: */
525 for (last_char=0; ; last_char=c) {
527 if (c == '/' && (last_char=='*'))
531 report_parse_error("unterminated c style comment found beginning", start_line_no);
538 * The following characters lex as themselves:
539 * '+', '|', '&', '(', ')', '.', ',' and <EOF>:
541 case 0: case '+': case '|': case '&': case '(':
542 case ')': case '.': case ',':
546 * Handle "=[^~=]", "=~", and "==":
549 switch (c = input()) {
560 * Handle "![^~=]", "!~", and "!=":
563 switch (c = input()) {
574 * Handle identifiers and keywords:
576 * Note that the below set of characters is hard coded from
577 * is_identifier_char from parser.h.
579 case 'a': case 'b': case 'c': case 'd': case 'e':
580 case 'f': case 'g': case 'h': case 'i': case 'j':
581 case 'k': case 'l': case 'm': case 'n': case 'o':
582 case 'p': case 'q': case 'r': case 's': case 't':
583 case 'u': case 'v': case 'w': case 'x': case 'y':
585 case 'A': case 'B': case 'C': case 'D': case 'E':
586 case 'F': case 'G': case 'H': case 'I': case 'J':
587 case 'K': case 'L': case 'M': case 'N': case 'O':
588 case 'P': case 'Q': case 'R': case 'S': case 'T':
589 case 'U': case 'V': case 'W': case 'X': case 'Y':
591 case '0': case '1': case '2': case '3': case '4':
592 case '5': case '6': case '7': case '8': case '9':
595 * Read in the first MAX_IDENTIFIER_LENGTH characters of the
596 * identifier into varname null terminated. Eat
597 * the rest of the characters of the identifier:
599 for (ptr = varname;;) {
600 if (ptr<varname+MAX_IDENTIFIER_LENGTH)
603 if (!is_identifier_char(c))
610 * Look up the identifier in the keyword dictionary.
611 * If it's a match, return the keyword's #. In the case
612 * of show, call handle_show to do more processing.
613 * If not a match, treat as a variable name.
615 binding = int_dictionary_Lookup(keyword_dict, varname);
617 yylval.text = string_Copy(varname);
620 if (binding->value == SHOW)
621 return(handle_show());
623 return(binding->value);
626 * Handle "${identifier}". Note that $ followed by a
627 * non-identifier character is not the start of any valid token.
631 if (!is_identifier_char(c))
635 * Read in the first MAX_IDENTIFIER_LENGTH characters of the
636 * identifier into varname null terminated. Eat
637 * the rest of the characters of the identifier:
639 for (ptr = varname;;) {
640 if (ptr<varname+MAX_IDENTIFIER_LENGTH)
643 if (!is_identifier_char(c))
649 yylval.text = string_Copy(varname);
653 * Handle constant strings:
656 if (yylval.text = eat_string(yylineno))
662 * All other characters do not start valid tokens: