/* $OpenBSD: parse.y,v 1.11 2024/11/09 18:03:44 op Exp $ */ /* parse.y - parser for flex input */ %token CHAR NUMBER SECTEND SCDECL XSCDECL NAME PREVCCL EOF_OP %token OPTION_OP OPT_OUTFILE OPT_PREFIX OPT_YYCLASS OPT_HEADER OPT_EXTRA_TYPE %token OPT_TABLES %token CCE_ALNUM CCE_ALPHA CCE_BLANK CCE_CNTRL CCE_DIGIT CCE_GRAPH %token CCE_LOWER CCE_PRINT CCE_PUNCT CCE_SPACE CCE_UPPER CCE_XDIGIT %token CCE_NEG_ALNUM CCE_NEG_ALPHA CCE_NEG_BLANK CCE_NEG_CNTRL CCE_NEG_DIGIT CCE_NEG_GRAPH %token CCE_NEG_LOWER CCE_NEG_PRINT CCE_NEG_PUNCT CCE_NEG_SPACE CCE_NEG_UPPER CCE_NEG_XDIGIT %left CCL_OP_DIFF CCL_OP_UNION /* *POSIX and AT&T lex place the * precedence of the repeat operator, {}, below that of concatenation. * Thus, ab{3} is ababab. Most other POSIX utilities use an Extended * Regular Expression (ERE) precedence that has the repeat operator * higher than concatenation. This causes ab{3} to yield abbb. * * In order to support the POSIX and AT&T precedence and the flex * precedence we define two token sets for the begin and end tokens of * the repeat operator, '{' and '}'. The lexical scanner chooses * which tokens to return based on whether posix_compat or lex_compat * are specified. Specifying either posix_compat or lex_compat will * cause flex to parse scanner files as per the AT&T and * POSIX-mandated behavior. */ %token BEGIN_REPEAT_POSIX END_REPEAT_POSIX BEGIN_REPEAT_FLEX END_REPEAT_FLEX %{ /* Copyright (c) 1990 The Regents of the University of California. */ /* All rights reserved. */ /* This code is derived from software contributed to Berkeley by */ /* Vern Paxson. */ /* The United States Government has rights in this work pursuant */ /* to contract no. DE-AC03-76SF00098 between the United States */ /* Department of Energy and the University of California. */ /* This file is part of flex. */ /* Redistribution and use in source and binary forms, with or without */ /* modification, are permitted provided that the following conditions */ /* are met: */ /* 1. Redistributions of source code must retain the above copyright */ /* notice, this list of conditions and the following disclaimer. */ /* 2. Redistributions in binary form must reproduce the above copyright */ /* notice, this list of conditions and the following disclaimer in the */ /* documentation and/or other materials provided with the distribution. */ /* Neither the name of the University nor the names of its contributors */ /* may be used to endorse or promote products derived from this software */ /* without specific prior written permission. */ /* THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR */ /* IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED */ /* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR */ /* PURPOSE. */ #include "flexdef.h" #include "tables.h" int pat, scnum, eps, headcnt, trailcnt, lastchar, i, rulelen; int trlcontxt, xcluflg, currccl, cclsorted, varlength, variable_trail_rule; int *scon_stk; int scon_stk_ptr; static int madeany = false; /* whether we've made the '.' character class */ static int ccldot, cclany; int previous_continued_action; /* whether the previous rule's action was '|' */ #define format_warn3(fmt, a1, a2) \ do{ \ char fw3_msg[MAXLINE];\ snprintf( fw3_msg, MAXLINE,(fmt), (a1), (a2) );\ warn( fw3_msg );\ }while(0) /* Expand a POSIX character class expression. */ #define CCL_EXPR(func) \ do{ \ int c; \ for ( c = 0; c < csize; ++c ) \ if ( isascii(c) && func(c) ) \ ccladd( currccl, c ); \ }while(0) /* negated class */ #define CCL_NEG_EXPR(func) \ do{ \ int c; \ for ( c = 0; c < csize; ++c ) \ if ( !func(c) ) \ ccladd( currccl, c ); \ }while(0) /* On some over-ambitious machines, such as DEC Alpha's, the default * token type is "long" instead of "int"; this leads to problems with * declaring yylval in flexdef.h. But so far, all the yacc's I've seen * wrap their definitions of YYSTYPE with "#ifndef YYSTYPE"'s, so the * following should ensure that the default token type is "int". */ #define YYSTYPE int %} %% goal : initlex sect1 sect1end sect2 initforrule { /* add default rule */ int def_rule; pat = cclinit(); cclnegate( pat ); def_rule = mkstate( -pat ); /* Remember the number of the default rule so we * don't generate "can't match" warnings for it. */ default_rule = num_rules; finish_rule( def_rule, false, 0, 0, 0); for ( i = 1; i <= lastsc; ++i ) scset[i] = mkbranch( scset[i], def_rule ); if ( spprdflt ) add_action( "YY_FATAL_ERROR( \"flex scanner jammed\" )" ); else add_action( "ECHO" ); add_action( ";\n\tYY_BREAK\n" ); } ; initlex : { /* initialize for processing rules */ /* Create default DFA start condition. */ scinstal( "INITIAL", false ); } ; sect1 : sect1 startconddecl namelist1 | sect1 options | | error { synerr( _("unknown error processing section 1") ); } ; sect1end : SECTEND { check_options(); scon_stk = allocate_integer_array( lastsc + 1 ); scon_stk_ptr = 0; } ; startconddecl : SCDECL { xcluflg = false; } | XSCDECL { xcluflg = true; } ; namelist1 : namelist1 NAME { scinstal( nmstr, xcluflg ); } | NAME { scinstal( nmstr, xcluflg ); } | error { synerr( _("bad start condition list") ); } ; options : OPTION_OP optionlist ; optionlist : optionlist option | ; option : OPT_OUTFILE '=' NAME { outfilename = copy_string( nmstr ); did_outfilename = 1; } | OPT_EXTRA_TYPE '=' NAME { extra_type = copy_string( nmstr ); } | OPT_PREFIX '=' NAME { prefix = copy_string( nmstr ); } | OPT_YYCLASS '=' NAME { yyclass = copy_string( nmstr ); } | OPT_HEADER '=' NAME { headerfilename = copy_string( nmstr ); } | OPT_TABLES '=' NAME { tablesext = true; tablesfilename = copy_string( nmstr ); } ; sect2 : sect2 scon initforrule flexrule '\n' { scon_stk_ptr = $2; } | sect2 scon '{' sect2 '}' { scon_stk_ptr = $2; } | ; initforrule : { /* Initialize for a parse of one rule. */ trlcontxt = variable_trail_rule = varlength = false; trailcnt = headcnt = rulelen = 0; current_state_type = STATE_NORMAL; previous_continued_action = continued_action; in_rule = true; new_rule(); } ; flexrule : '^' rule { pat = $2; finish_rule( pat, variable_trail_rule, headcnt, trailcnt , previous_continued_action); if ( scon_stk_ptr > 0 ) { for ( i = 1; i <= scon_stk_ptr; ++i ) scbol[scon_stk[i]] = mkbranch( scbol[scon_stk[i]], pat ); } else { /* Add to all non-exclusive start conditions, * including the default (0) start condition. */ for ( i = 1; i <= lastsc; ++i ) if ( ! scxclu[i] ) scbol[i] = mkbranch( scbol[i], pat ); } if ( ! bol_needed ) { bol_needed = true; if ( performance_report > 1 ) pinpoint_message( "'^' operator results in sub-optimal performance" ); } } | rule { pat = $1; finish_rule( pat, variable_trail_rule, headcnt, trailcnt , previous_continued_action); if ( scon_stk_ptr > 0 ) { for ( i = 1; i <= scon_stk_ptr; ++i ) scset[scon_stk[i]] = mkbranch( scset[scon_stk[i]], pat ); } else { for ( i = 1; i <= lastsc; ++i ) if ( ! scxclu[i] ) scset[i] = mkbranch( scset[i], pat ); } } | EOF_OP { if ( scon_stk_ptr > 0 ) build_eof_action(); else { /* This EOF applies to all start conditions * which don't already have EOF actions. */ for ( i = 1; i <= lastsc; ++i ) if ( ! sceof[i] ) scon_stk[++scon_stk_ptr] = i; if ( scon_stk_ptr == 0 ) warn( "all start conditions already have <> rules" ); else build_eof_action(); } } | error { synerr( _("unrecognized rule") ); } ; scon_stk_ptr : { $$ = scon_stk_ptr; } ; scon : '<' scon_stk_ptr namelist2 '>' { $$ = $2; } | '<' '*' '>' { $$ = scon_stk_ptr; for ( i = 1; i <= lastsc; ++i ) { int j; for ( j = 1; j <= scon_stk_ptr; ++j ) if ( scon_stk[j] == i ) break; if ( j > scon_stk_ptr ) scon_stk[++scon_stk_ptr] = i; } } | { $$ = scon_stk_ptr; } ; namelist2 : namelist2 ',' sconname | sconname | error { synerr( _("bad start condition list") ); } ; sconname : NAME { if ( (scnum = sclookup( nmstr )) == 0 ) format_pinpoint_message( "undeclared start condition %s", nmstr ); else { for ( i = 1; i <= scon_stk_ptr; ++i ) if ( scon_stk[i] == scnum ) { format_warn( "<%s> specified twice", scname[scnum] ); break; } if ( i > scon_stk_ptr ) scon_stk[++scon_stk_ptr] = scnum; } } ; rule : re2 re { if ( transchar[lastst[$2]] != SYM_EPSILON ) /* Provide final transition \now/ so it * will be marked as a trailing context * state. */ $2 = link_machines( $2, mkstate( SYM_EPSILON ) ); mark_beginning_as_normal( $2 ); current_state_type = STATE_NORMAL; if ( previous_continued_action ) { /* We need to treat this as variable trailing * context so that the backup does not happen * in the action but before the action switch * statement. If the backup happens in the * action, then the rules "falling into" this * one's action will *also* do the backup, * erroneously. */ if ( ! varlength || headcnt != 0 ) warn( "trailing context made variable due to preceding '|' action" ); /* Mark as variable. */ varlength = true; headcnt = 0; } if ( lex_compat || (varlength && headcnt == 0) ) { /* variable trailing context rule */ /* Mark the first part of the rule as the * accepting "head" part of a trailing * context rule. * * By the way, we didn't do this at the * beginning of this production because back * then current_state_type was set up for a * trail rule, and add_accept() can create * a new state ... */ add_accept( $1, num_rules | YY_TRAILING_HEAD_MASK ); variable_trail_rule = true; } else trailcnt = rulelen; $$ = link_machines( $1, $2 ); } | re2 re '$' { synerr( _("trailing context used twice") ); } | re '$' { headcnt = 0; trailcnt = 1; rulelen = 1; varlength = false; current_state_type = STATE_TRAILING_CONTEXT; if ( trlcontxt ) { synerr( _("trailing context used twice") ); $$ = mkstate( SYM_EPSILON ); } else if ( previous_continued_action ) { /* See the comment in the rule for "re2 re" * above. */ warn( "trailing context made variable due to preceding '|' action" ); varlength = true; } if ( lex_compat || varlength ) { /* Again, see the comment in the rule for * "re2 re" above. */ add_accept( $1, num_rules | YY_TRAILING_HEAD_MASK ); variable_trail_rule = true; } trlcontxt = true; eps = mkstate( SYM_EPSILON ); $$ = link_machines( $1, link_machines( eps, mkstate( '\n' ) ) ); } | re { $$ = $1; if ( trlcontxt ) { if ( lex_compat || (varlength && headcnt == 0) ) /* Both head and trail are * variable-length. */ variable_trail_rule = true; else trailcnt = rulelen; } } ; re : re '|' series { varlength = true; $$ = mkor( $1, $3 ); } | series { $$ = $1; } ; re2 : re '/' { /* This rule is written separately so the * reduction will occur before the trailing * series is parsed. */ if ( trlcontxt ) synerr( _("trailing context used twice") ); else trlcontxt = true; if ( varlength ) /* We hope the trailing context is * fixed-length. */ varlength = false; else headcnt = rulelen; rulelen = 0; current_state_type = STATE_TRAILING_CONTEXT; $$ = $1; } ; series : series singleton { /* This is where concatenation of adjacent patterns * gets done. */ $$ = link_machines( $1, $2 ); } | singleton { $$ = $1; } | series BEGIN_REPEAT_POSIX NUMBER ',' NUMBER END_REPEAT_POSIX { varlength = true; if ( $3 > $5 || $3 < 0 ) { synerr( _("bad iteration values") ); $$ = $1; } else { if ( $3 == 0 ) { if ( $5 <= 0 ) { synerr( _("bad iteration values") ); $$ = $1; } else $$ = mkopt( mkrep( $1, 1, $5 ) ); } else $$ = mkrep( $1, $3, $5 ); } } | series BEGIN_REPEAT_POSIX NUMBER ',' END_REPEAT_POSIX { varlength = true; if ( $3 <= 0 ) { synerr( _("iteration value must be positive") ); $$ = $1; } else $$ = mkrep( $1, $3, INFINITE_REPEAT ); } | series BEGIN_REPEAT_POSIX NUMBER END_REPEAT_POSIX { /* The series could be something like "(foo)", * in which case we have no idea what its length * is, so we punt here. */ varlength = true; if ( $3 <= 0 ) { synerr( _("iteration value must be positive") ); $$ = $1; } else $$ = link_machines( $1, copysingl( $1, $3 - 1 ) ); } ; singleton : singleton '*' { varlength = true; $$ = mkclos( $1 ); } | singleton '+' { varlength = true; $$ = mkposcl( $1 ); } | singleton '?' { varlength = true; $$ = mkopt( $1 ); } | singleton BEGIN_REPEAT_FLEX NUMBER ',' NUMBER END_REPEAT_FLEX { varlength = true; if ( $3 > $5 || $3 < 0 ) { synerr( _("bad iteration values") ); $$ = $1; } else { if ( $3 == 0 ) { if ( $5 <= 0 ) { synerr( _("bad iteration values") ); $$ = $1; } else $$ = mkopt( mkrep( $1, 1, $5 ) ); } else $$ = mkrep( $1, $3, $5 ); } } | singleton BEGIN_REPEAT_FLEX NUMBER ',' END_REPEAT_FLEX { varlength = true; if ( $3 <= 0 ) { synerr( _("iteration value must be positive") ); $$ = $1; } else $$ = mkrep( $1, $3, INFINITE_REPEAT ); } | singleton BEGIN_REPEAT_FLEX NUMBER END_REPEAT_FLEX { /* The singleton could be something like "(foo)", * in which case we have no idea what its length * is, so we punt here. */ varlength = true; if ( $3 <= 0 ) { synerr( _("iteration value must be positive") ); $$ = $1; } else $$ = link_machines( $1, copysingl( $1, $3 - 1 ) ); } | '.' { if ( ! madeany ) { /* Create the '.' character class. */ ccldot = cclinit(); ccladd( ccldot, '\n' ); cclnegate( ccldot ); if ( useecs ) mkeccl( ccltbl + cclmap[ccldot], ccllen[ccldot], nextecm, ecgroup, csize, csize ); /* Create the (?s:'.') character class. */ cclany = cclinit(); cclnegate( cclany ); if ( useecs ) mkeccl( ccltbl + cclmap[cclany], ccllen[cclany], nextecm, ecgroup, csize, csize ); madeany = true; } ++rulelen; if (sf_dot_all()) $$ = mkstate( -cclany ); else $$ = mkstate( -ccldot ); } | fullccl { /* Sort characters for fast searching. */ qsort( ccltbl + cclmap[$1], ccllen[$1], sizeof (*ccltbl), cclcmp ); if ( useecs ) mkeccl( ccltbl + cclmap[$1], ccllen[$1], nextecm, ecgroup, csize, csize ); ++rulelen; if (ccl_has_nl[$1]) rule_has_nl[num_rules] = true; $$ = mkstate( -$1 ); } | PREVCCL { ++rulelen; if (ccl_has_nl[$1]) rule_has_nl[num_rules] = true; $$ = mkstate( -$1 ); } | '"' string '"' { $$ = $2; } | '(' re ')' { $$ = $2; } | CHAR { ++rulelen; if ($1 == nlch) rule_has_nl[num_rules] = true; if (sf_case_ins() && has_case($1)) /* create an alternation, as in (a|A) */ $$ = mkor (mkstate($1), mkstate(reverse_case($1))); else $$ = mkstate( $1 ); } ; fullccl: fullccl CCL_OP_DIFF braceccl { $$ = ccl_set_diff ($1, $3); } | fullccl CCL_OP_UNION braceccl { $$ = ccl_set_union ($1, $3); } | braceccl ; braceccl: '[' ccl ']' { $$ = $2; } | '[' '^' ccl ']' { cclnegate( $3 ); $$ = $3; } ; ccl : ccl CHAR '-' CHAR { if (sf_case_ins()) { /* If one end of the range has case and the other * does not, or the cases are different, then we're not * sure what range the user is trying to express. * Examples: [@-z] or [S-t] */ if (has_case ($2) != has_case ($4) || (has_case ($2) && (b_islower ($2) != b_islower ($4))) || (has_case ($2) && (b_isupper ($2) != b_isupper ($4)))) format_warn3 ( _("the character range [%c-%c] is ambiguous in a case-insensitive scanner"), $2, $4); /* If the range spans uppercase characters but not * lowercase (or vice-versa), then should we automatically * include lowercase characters in the range? * Example: [@-_] spans [a-z] but not [A-Z] */ else if (!has_case ($2) && !has_case ($4) && !range_covers_case ($2, $4)) format_warn3 ( _("the character range [%c-%c] is ambiguous in a case-insensitive scanner"), $2, $4); } if ( $2 > $4 ) synerr( _("negative range in character class") ); else { for ( i = $2; i <= $4; ++i ) ccladd( $1, i ); /* Keep track if this ccl is staying in * alphabetical order. */ cclsorted = cclsorted && ($2 > lastchar); lastchar = $4; /* Do it again for upper/lowercase */ if (sf_case_ins() && has_case($2) && has_case($4)){ $2 = reverse_case ($2); $4 = reverse_case ($4); for ( i = $2; i <= $4; ++i ) ccladd( $1, i ); cclsorted = cclsorted && ($2 > lastchar); lastchar = $4; } } $$ = $1; } | ccl CHAR { ccladd( $1, $2 ); cclsorted = cclsorted && ($2 > lastchar); lastchar = $2; /* Do it again for upper/lowercase */ if (sf_case_ins() && has_case($2)){ $2 = reverse_case ($2); ccladd ($1, $2); cclsorted = cclsorted && ($2 > lastchar); lastchar = $2; } $$ = $1; } | ccl ccl_expr { /* Too hard to properly maintain cclsorted. */ cclsorted = false; $$ = $1; } | { cclsorted = true; lastchar = 0; currccl = $$ = cclinit(); } ; ccl_expr: CCE_ALNUM { CCL_EXPR(isalnum); } | CCE_ALPHA { CCL_EXPR(isalpha); } | CCE_BLANK { CCL_EXPR(isblank); } | CCE_CNTRL { CCL_EXPR(iscntrl); } | CCE_DIGIT { CCL_EXPR(isdigit); } | CCE_GRAPH { CCL_EXPR(isgraph); } | CCE_LOWER { CCL_EXPR(islower); if (sf_case_ins()) CCL_EXPR(isupper); } | CCE_PRINT { CCL_EXPR(isprint); } | CCE_PUNCT { CCL_EXPR(ispunct); } | CCE_SPACE { CCL_EXPR(isspace); } | CCE_XDIGIT { CCL_EXPR(isxdigit); } | CCE_UPPER { CCL_EXPR(isupper); if (sf_case_ins()) CCL_EXPR(islower); } | CCE_NEG_ALNUM { CCL_NEG_EXPR(isalnum); } | CCE_NEG_ALPHA { CCL_NEG_EXPR(isalpha); } | CCE_NEG_BLANK { CCL_NEG_EXPR(isblank); } | CCE_NEG_CNTRL { CCL_NEG_EXPR(iscntrl); } | CCE_NEG_DIGIT { CCL_NEG_EXPR(isdigit); } | CCE_NEG_GRAPH { CCL_NEG_EXPR(isgraph); } | CCE_NEG_PRINT { CCL_NEG_EXPR(isprint); } | CCE_NEG_PUNCT { CCL_NEG_EXPR(ispunct); } | CCE_NEG_SPACE { CCL_NEG_EXPR(isspace); } | CCE_NEG_XDIGIT { CCL_NEG_EXPR(isxdigit); } | CCE_NEG_LOWER { if ( sf_case_ins() ) warn(_("[:^lower:] is ambiguous in case insensitive scanner")); else CCL_NEG_EXPR(islower); } | CCE_NEG_UPPER { if ( sf_case_ins() ) warn(_("[:^upper:] ambiguous in case insensitive scanner")); else CCL_NEG_EXPR(isupper); } ; string : string CHAR { if ( $2 == nlch ) rule_has_nl[num_rules] = true; ++rulelen; if (sf_case_ins() && has_case($2)) $$ = mkor (mkstate($2), mkstate(reverse_case($2))); else $$ = mkstate ($2); $$ = link_machines( $1, $$); } | { $$ = mkstate( SYM_EPSILON ); } ; %% /* build_eof_action - build the "<>" action for the active start * conditions */ void build_eof_action(void) { int i; char action_text[MAXLINE]; for ( i = 1; i <= scon_stk_ptr; ++i ) { if ( sceof[scon_stk[i]] ) format_pinpoint_message( "multiple <> rules for start condition %s", scname[scon_stk[i]] ); else { sceof[scon_stk[i]] = true; if (previous_continued_action /* && previous action was regular */) add_action("YY_RULE_SETUP\n"); snprintf( action_text, sizeof(action_text), "case YY_STATE_EOF(%s):\n", scname[scon_stk[i]] ); add_action( action_text ); } } line_directive_out( (FILE *) 0, 1 ); /* This isn't a normal rule after all - don't count it as * such, so we don't have any holes in the rule numbering * (which make generating "rule can never match" warnings * more difficult. */ --num_rules; ++num_eof_rules; } /* format_synerr - write out formatted syntax error */ void format_synerr(const char *msg, const char arg[]) { char errmsg[MAXLINE]; (void) snprintf( errmsg, sizeof(errmsg), msg, arg ); synerr( errmsg ); } /* synerr - report a syntax error */ void synerr(const char *str) { syntaxerror = true; pinpoint_message( str ); } /* format_warn - write out formatted warning */ void format_warn(const char *msg, const char arg[]) { char warn_msg[MAXLINE]; snprintf( warn_msg, sizeof(warn_msg), msg, arg ); warn( warn_msg ); } /* warn - report a warning, unless -w was given */ void warn(const char *str) { line_warning( str, linenum ); } /* format_pinpoint_message - write out a message formatted with one string, * pinpointing its location */ void format_pinpoint_message(const char *msg, const char arg[]) { char errmsg[MAXLINE]; snprintf( errmsg, sizeof(errmsg), msg, arg ); pinpoint_message( errmsg ); } /* pinpoint_message - write out a message, pinpointing its location */ void pinpoint_message(const char *str) { line_pinpoint( str, linenum ); } /* line_warning - report a warning at a given line, unless -w was given */ void line_warning(const char *str, int line) { char warning[MAXLINE]; if ( ! nowarn ) { snprintf( warning, sizeof(warning), "warning, %s", str ); line_pinpoint( warning, line ); } } /* line_pinpoint - write out a message, pinpointing it at the given line */ void line_pinpoint(const char *str, int line) { fprintf( stderr, "%s:%d: %s\n", infilename, line, str ); } /* yyerror - eat up an error message from the parser; * currently, messages are ignore */ void yyerror(const char *msg) { }