glsl/glcpp: Correctly parse directives with intervening comments

It's legal (though highly bizarre) for a pre-processor directive to look like this: # /* why? */ define FOO bar This behavior comes about since the specification defines separate logical phases in a precise order, and comment-removal occurs in a phase before the identification of directives. Our implementation does not use an actual separate phase for comment removal, so some extra care is necessary to correctly parse this. What we want is for '#' to introduce a directive iff it is the first token on a line, (ignoring whitespace and comments). Previously, we had a lexical rule that worked only for whitespace (not comments) with the following regular expression to find a directive-introducing '#' at the beginning of a line: HASH ^{HSPACE}*#{HSPACE}* In this commit, we switch to instead use a simple literal match of '#' to return a HASH_TOKEN token and add a new <HASH> start condition for whenever the HASH_TOKEN is the first non-space token of a line. This requires the addition of the new bit of state: first_non_space_token_this_line. This approach has a couple of implications on the glcpp parser: 1. The parser now sees two separate tokens, (such as HASH_TOKEN and HASH_DEFINE) where it previously saw one token (HASH_DEFINE) for the sequence "#define". This is a straightforward change throughout the grammar. 2. The parser may now see a SPACE token before the HASH_TOKEN token of a directive. Previously the lexical regular expression for {HASH} would eat up the space and there would be no SPACE token. This second implication is a bit of a nuisance for the parser. It causes a SPACE token to appear in a production of the grammar with the following two definitions of a control_line: control_line SPACE control_line This is really ugly, since normally a space would simply be a token separator, so it wouldn't appear in the tokens of a production. This leads to a further problem with interleaved spaces and comments: /* ... */ /* ... 
*/ #define /* ..*/ For this, we must not return several consecutive SPACE tokens, or else we would need an arbitrary number of new productions: SPACE SPACE control_line SPACE SPACE SPACE control_line ad nauseam To avoid this problem, in this commit we also change the lexer to emit only a single SPACE token for any series of consecutive spaces, (whether from actual whitespace or comments). For this compression, we add a new bit of parser state: last_token_was_space. And we also update the expected results of all necessary test cases for the new compression of space tokens. Fortunately, the compression of spaces should not lead to any semantic changes in terms of what the eventual GLSL compiler sees. So there's a lot happening in this commit, (particularly for such a tiny feature). But fortunately, the lexer itself is looking cleaner than ever. The only ugly bit is all the state updating, but it is at least isolated to a single shared function. Of course, a new "make check" test is added for the new feature, (directives with comments and whitespace interleaved in many combinations). And this commit fixes the following Khronos GLES3 CTS tests: function_definition_with_comments_vertex function_definition_with_comments_fragment Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
author: Carl Worth <cworth@cworth.org> 2014-06-25 12:20:22 -0700
committer: Carl Worth <cworth@cworth.org> 2014-07-29 15:11:50 -0700
commit: f062f0506a5b827667b7eb52136d8420b7e8113b (patch)
tree: 6f758f87be25365ee6a72c2f68776f4f54afbd7e /src/glsl/glcpp/glcpp-lex.l
parent: dfdf9dc082cbab332457ea2dbe012eeb0d164ce4 (diff)
1 files changed, 120 insertions, 42 deletions
diff --git a/src/glsl/glcpp/glcpp-lex.l b/src/glsl/glcpp/glcpp-lex.l
index 5a5bbe1886a..60bc0800b2d 100644
--- a/src/glsl/glcpp/glcpp-lex.l
+++ b/src/glsl/glcpp/glcpp-lex.l
@@ -52,7 +52,7 @@ void glcpp_set_column (int  column_no , yyscan_t yyscanner);
 		yylloc->last_column = yycolumn + 1;			\
 		parser->has_new_line_number = 0;			\
 		parser->has_new_source_number = 0;			\
- } while(0);
+	} while(0);
 
 #define YY_USER_INIT			\
 	do {				\
@@ -85,13 +85,10 @@ void glcpp_set_column (int  column_no , yyscan_t yyscanner);
  * of RETURN_TOKEN that performs a string copy of yytext before the
  * return.
  */
-#define RETURN_TOKEN_NEVER_SKIP(token)				\
-	do {							\
-		if (token == NEWLINE)				\
-			parser->last_token_was_newline = 1;	\
-		else						\
-			parser->last_token_was_newline = 0;	\
-		return (token);					\
+#define RETURN_TOKEN_NEVER_SKIP(token)					\
+	do {								\
+		if (glcpp_lex_update_state_per_token (parser, token))	\
+			return token;					\
 	} while (0)
 
 #define RETURN_TOKEN(token)						\
@@ -109,6 +106,53 @@ void glcpp_set_column (int  column_no , yyscan_t yyscanner);
 		}							\
 	} while(0)
 
+
+/* Update all state necessary for each token being returned.
+ *
+ * Here we'll be tracking newlines and spaces so that the lexer can
+ * alter its behavior as necessary, (for example, '#' has special
+ * significance if it is the first non-whitespace, non-comment token
+ * in a line, but does not otherwise).
+ *
+ * NOTE: If this function returns FALSE, then no token should be
+ * returned at all. This is used to suppress duplicate SPACE tokens.
+ */
+static int
+glcpp_lex_update_state_per_token (glcpp_parser_t *parser, int token)
+{
+	/* After the first non-space token in a line, we won't
+	 * allow any '#' to introduce a directive. */
+	if (token == NEWLINE) {
+		parser->first_non_space_token_this_line = 1;
+	} else if (token != SPACE) {
+		parser->first_non_space_token_this_line = 0;
+	}
+
+	/* Track newlines just to know whether a newline needs
+	 * to be inserted if end-of-file comes early. */
+	if (token == NEWLINE) {
+		parser->last_token_was_newline = 1;
+	} else {
+		parser->last_token_was_newline = 0;
+	}
+
+	/* Track spaces to avoid emitting multiple SPACE
+	 * tokens in a row. */
+	if (token == SPACE) {
+		if (! parser->last_token_was_space) {
+			parser->last_token_was_space = 1;
+			return 1;
+		} else {
+			parser->last_token_was_space = 1;
+			return 0;
+		}
+	} else {
+		parser->last_token_was_space = 0;
+		return 1;
+	}
+}
+
+
 %}
 
 %option bison-bridge bison-locations reentrant noyywrap
@@ -117,13 +161,13 @@ void glcpp_set_column (int  column_no , yyscan_t yyscanner);
 %option stack
 %option never-interactive
 
-%x DONE COMMENT UNREACHABLE DEFINE NEWLINE_CATCHUP
+%x DONE COMMENT HASH UNREACHABLE DEFINE NEWLINE_CATCHUP
 
 SPACE		[[:space:]]
 NONSPACE	[^[:space:]]
 NEWLINE		[\n]
 HSPACE		[ \t]
-HASH		^{HSPACE}*#{HSPACE}*
+HASH		#
 IDENTIFIER	[_a-zA-Z][_a-zA-Z0-9]*
 PP_NUMBER	[.]?[0-9]([._a-zA-Z0-9]|[eEpP][-+])*
 PUNCTUATION	[][(){}.&*~!/%<>^|;,=+-]
@@ -160,7 +204,7 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 			parser->commented_newlines--;
 		if (parser->commented_newlines == 0)
 			BEGIN INITIAL;
-		RETURN_TOKEN (NEWLINE);
+		RETURN_TOKEN_NEVER_SKIP (NEWLINE);
 	}
 
 	/* Set up the parser->skipping bit here before doing any lexing.
@@ -206,77 +250,103 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 }
 
 	/* Multi-line comments */
-<DEFINE,INITIAL>"/*"                    { yy_push_state(COMMENT, yyscanner); }
+<DEFINE,HASH,INITIAL>"/*"                    { yy_push_state(COMMENT, yyscanner); }
 <COMMENT>[^*\n]*
 <COMMENT>[^*\n]*\n      { yylineno++; yycolumn = 0; parser->commented_newlines++; }
 <COMMENT>"*"+[^*/\n]*
 <COMMENT>"*"+[^*/\n]*\n { yylineno++; yycolumn = 0; parser->commented_newlines++; }
 <COMMENT>"*"+"/"        {
 	yy_pop_state(yyscanner);
-	if (yyextra->space_tokens)
+	/* In the <HASH> start condition, we don't want any SPACE token. */
+	if (yyextra->space_tokens && YY_START != HASH)
 		RETURN_TOKEN (SPACE);
 }
 
-{HASH}version{HSPACE}+ {
+{HASH} {
+
+	/* If the '#' is the first non-whitespace, non-comment token on this
+	 * line, then it introduces a directive, so switch to the <HASH>
+	 * start condition.
+	 *
+	 * Otherwise, this is just punctuation. Either way, return the
+	 * HASH_TOKEN token. */
+	if (parser->first_non_space_token_this_line) {
+		BEGIN HASH;
+	}
+
+	RETURN_TOKEN_NEVER_SKIP (HASH_TOKEN);
+}
+
+<HASH>version{HSPACE}+ {
+	BEGIN INITIAL;
 	yyextra->space_tokens = 0;
 	RETURN_STRING_TOKEN (HASH_VERSION);
 }
 
 	/* glcpp doesn't handle #extension, #version, or #pragma directives.
 	 * Simply pass them through to the main compiler's lexer/parser. */
-{HASH}(extension|pragma)[^\n]* {
+<HASH>(extension|pragma)[^\n]* {
+	BEGIN INITIAL;
 	yylineno++;
 	yycolumn = 0;
-	RETURN_STRING_TOKEN (OTHER);
+	RETURN_STRING_TOKEN (HASH_PRAGMA);
 }
 
-{HASH}line{HSPACE}+ {
+<HASH>line{HSPACE}+ {
+	BEGIN INITIAL;
 	RETURN_TOKEN (HASH_LINE);
 }
 
+<HASH>\n {
+	BEGIN INITIAL;
+	RETURN_TOKEN_NEVER_SKIP (NEWLINE);
+}
+
 	/* For the pre-processor directives, we return these tokens
 	 * even when we are otherwise skipping. */
-{HASH}ifdef {
+<HASH>ifdef {
+	BEGIN INITIAL;
 	yyextra->lexing_directive = 1;
 	yyextra->space_tokens = 0;
 	RETURN_TOKEN_NEVER_SKIP (HASH_IFDEF);
 }
 
-{HASH}ifndef {
+<HASH>ifndef {
+	BEGIN INITIAL;
 	yyextra->lexing_directive = 1;
 	yyextra->space_tokens = 0;
 	RETURN_TOKEN_NEVER_SKIP (HASH_IFNDEF);
 }
 
-{HASH}if/[^_a-zA-Z0-9] {
+<HASH>if/[^_a-zA-Z0-9] {
+	BEGIN INITIAL;
 	yyextra->lexing_directive = 1;
 	yyextra->space_tokens = 0;
 	RETURN_TOKEN_NEVER_SKIP (HASH_IF);
 }
 
-{HASH}elif/[^_a-zA-Z0-9] {
+<HASH>elif/[^_a-zA-Z0-9] {
+	BEGIN INITIAL;
 	yyextra->lexing_directive = 1;
 	yyextra->space_tokens = 0;
 	RETURN_TOKEN_NEVER_SKIP (HASH_ELIF);
 }
 
-{HASH}else {
+<HASH>else {
+	BEGIN INITIAL;
 	yyextra->space_tokens = 0;
 	RETURN_TOKEN_NEVER_SKIP (HASH_ELSE);
 }
 
-{HASH}endif {
+<HASH>endif {
+	BEGIN INITIAL;
 	yyextra->space_tokens = 0;
 	RETURN_TOKEN_NEVER_SKIP (HASH_ENDIF);
 }
 
-{HASH}error.* {
-	if (! parser->skipping) {
-		char *p;
-		for (p = yytext; !isalpha(p[0]); p++); /* skip "  #   " */
-		p += 5; /* skip "error" */
-		glcpp_error(yylloc, yyextra, "#error%s", p);
-	}
+<HASH>error.* {
+	BEGIN INITIAL;
+	RETURN_STRING_TOKEN (HASH_ERROR);
 }
 
 	/* After we see a "#define" we enter the <DEFINE> start state
@@ -297,7 +367,7 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 	 *	* Anything else, (not an identifier, not a comment,
 	 *	  and not whitespace). This will generate an error.
 	 */
-{HASH}define{HSPACE}+ {
+<HASH>define{HSPACE}+ {
 	if (! parser->skipping) {
 		BEGIN DEFINE;
 		yyextra->space_tokens = 0;
@@ -305,6 +375,24 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 	}
 }
 
+<HASH>undef {
+	BEGIN INITIAL;
+	yyextra->space_tokens = 0;
+	RETURN_TOKEN (HASH_UNDEF);
+}
+
+<HASH>{HSPACE}+ {
+	/* Nothing to do here. Importantly, don't leave the <HASH>
+	 * start condition, since it's legal to have space between the
+	 * '#' and the directive. */
+}
+
+	/* This will catch any non-directive garbage after a HASH */
+<HASH>{NONSPACE} {
+	BEGIN INITIAL;
+	RETURN_TOKEN (HASH_GARBAGE);
+}
+
 	/* An identifier immediately followed by '(' */
 <DEFINE>{IDENTIFIER}/"(" {
 	BEGIN INITIAL;
@@ -337,16 +425,6 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 	RETURN_STRING_TOKEN (INTEGER_STRING);
 }
 
-{HASH}undef {
-	yyextra->space_tokens = 0;
-	RETURN_TOKEN (HASH_UNDEF);
-}
-
-{HASH} {
-	yyextra->space_tokens = 0;
-	RETURN_TOKEN (HASH_TOKEN);
-}
-
 {DECIMAL_INTEGER} {
 	RETURN_STRING_TOKEN (INTEGER_STRING);
 }
@@ -438,7 +516,7 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 	RETURN_TOKEN_NEVER_SKIP (NEWLINE);
 }
 
-<INITIAL,COMMENT,DEFINE><<EOF>> {
+<INITIAL,COMMENT,DEFINE,HASH><<EOF>> {
 	if (YY_START == COMMENT)
 		glcpp_error(yylloc, yyextra, "Unterminated comment");
 	if (YY_START == DEFINE)
author	Carl Worth <cworth@cworth.org>	2014-06-25 12:20:22 -0700
committer	Carl Worth <cworth@cworth.org>	2014-07-29 15:11:50 -0700
commit	f062f0506a5b827667b7eb52136d8420b7e8113b (patch)
tree	6f758f87be25365ee6a72c2f68776f4f54afbd7e /src/glsl/glcpp/glcpp-lex.l
parent	dfdf9dc082cbab332457ea2dbe012eeb0d164ce4 (diff)