From 4e435b1321f09674ab3345ba8e99bc53f20f9b68 Mon Sep 17 00:00:00 2001
From: Jesse Beder <jbeder+github@gmail.com>
Date: Fri, 27 Jun 2008 08:20:41 +0000
Subject: [PATCH] Wrote a simplified regular expression parser to make life
 easier (it only does single matches; i.e., no one-or-more matches, etc.).
 Fixed some of the whitespace/line break matching.

---
 main.cpp           |  17 ++++++
 regex.cpp          | 114 ++++++++++++++++++++++++++++++++++++++
 regex.h            |  37 +++++++++++++
 scanner.cpp        | 133 ++++++++++++++++++++++-----------------------
 scanner.h          |  37 +++++++++++--
 test.yaml          |   6 +-
 yaml-reader.vcproj |   8 +++
 7 files changed, 277 insertions(+), 75 deletions(-)
 create mode 100644 regex.cpp
 create mode 100644 regex.h

diff --git a/main.cpp b/main.cpp
index f5f3fa89a4..6ff8af6424 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,7 +1,24 @@
 #include "document.h"
+#include "regex.h"
 
 int main()
 {
+	YAML::RegEx alpha = YAML::RegEx('a', 'z') || YAML::RegEx('A', 'Z');
+	alpha.Matches("a");
+	alpha.Matches("d");
+	alpha.Matches("F");
+	alpha.Matches("0");
+	alpha.Matches("5");
+	alpha.Matches(" ");
+
+	YAML::RegEx blank = YAML::RegEx(' ') || YAML::RegEx('\t');
+	YAML::RegEx docstart = YAML::RegEx("---") + (blank || YAML::RegEx(EOF) || YAML::RegEx());
+	docstart.Matches("--- ");
+	docstart.Matches("... ");
+	docstart.Matches("----");
+	docstart.Matches("---\t");
+	docstart.Matches("---");
+
 	YAML::Document doc("test.yaml");
 
 	return 0;
diff --git a/regex.cpp b/regex.cpp
new file mode 100644
index 0000000000..a2907f513f
--- /dev/null
+++ b/regex.cpp
@@ -0,0 +1,114 @@
+#include "regex.h"
+
+namespace YAML
+{
+	RegEx::RegEx(REGEX_OP op): m_op(op)
+	{
+	}
+
+	RegEx::RegEx(): m_op(REGEX_EMPTY)
+	{
+	}
+
+	RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_a(ch)
+	{
+	}
+
+	RegEx::RegEx(char a, char z): m_op(REGEX_RANGE), m_a(a), m_z(z)
+	{
+	}
+
+	RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op)	// op says how the characters combine (REGEX_SEQ or REGEX_OR)
+	{
+		for(unsigned i=0;i<str.size();i++)
+			m_params.push_back(RegEx(str[i]));	// one single-character matcher per character (was str[0]: every entry matched the first char)
+	}
+
+	RegEx::~RegEx()
+	{
+	}
+
+	bool RegEx::Matches(char ch) const
+	{
+		std::string str;
+		str += ch;
+		return Matches(str);
+	}
+
+	bool RegEx::Matches(const std::string& str) const
+	{
+		return Match(str) >= 0;
+	}
+
+	// Match
+	// . Matches the given string against this regular expression.
+	// . Returns the number of characters matched.
+	// . Returns -1 if the match failed (the reason for not
+	//   returning zero is that we may have an empty regex
+	//   which SHOULD be considered successfully matching nothing,
+	//   but that of course matches zero characters).
+	int RegEx::Match(const std::string& str) const
+	{
+		switch(m_op) {
+			case REGEX_EMPTY:	// succeeds only when there is no input left
+				if(str.empty())
+					return 0;
+				return -1;
+			case REGEX_MATCH:	// single literal character, stored in m_a
+				if(str.empty() || str[0] != m_a)
+					return -1;
+				return 1;
+			case REGEX_RANGE:	// inclusive character range m_a..m_z
+				if(str.empty() || m_a > str[0] || m_z < str[0])
+					return -1;
+				return 1;
+			case REGEX_NOT:	// succeeds (reporting one character) when the operand fails
+				if(m_params.empty())
+					return -1;	// nothing to negate: report failure (-1), not 0, which would read as a zero-length match
+				if(m_params[0].Match(str) >= 0)
+					return -1;
+				return 1;
+			case REGEX_OR:	// the first alternative that matches decides the length
+				for(unsigned i=0;i<m_params.size();i++) {
+					int n = m_params[i].Match(str);
+					if(n >= 0)
+						return n;
+				}
+				return -1;
+			case REGEX_SEQ:	// every operand must match, each starting where the previous one stopped
+				int offset = 0;
+				for(unsigned i=0;i<m_params.size();i++) {
+					int n = m_params[i].Match(str.substr(offset));
+					if(n == -1)
+						return -1;
+					offset += n;
+				}
+				return offset;
+		}
+
+		return -1;
+	}
+
+	RegEx operator ! (const RegEx& ex)
+	{
+		RegEx ret(REGEX_NOT);
+		ret.m_params.push_back(ex);
+		return ret;
+	}
+
+	RegEx operator || (const RegEx& ex1, const RegEx& ex2)
+	{
+		RegEx ret(REGEX_OR);
+		ret.m_params.push_back(ex1);
+		ret.m_params.push_back(ex2);
+		return ret;
+	}
+
+	RegEx operator + (const RegEx& ex1, const RegEx& ex2)
+	{
+		RegEx ret(REGEX_SEQ);
+		ret.m_params.push_back(ex1);
+		ret.m_params.push_back(ex2);
+		return ret;
+	}
+}
diff --git a/regex.h b/regex.h
new file mode 100644
index 0000000000..08d7acc147
--- /dev/null
+++ b/regex.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include <vector>
+#include <string>
+
+namespace YAML
+{
+	enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_NOT, REGEX_SEQ };
+
+	// simplified regular expressions
+	// . Only straightforward matches (no repeated characters)
+	// . Only matches from start of string
+	class RegEx {
+	public:
+		RegEx();	// REGEX_EMPTY: matches only an empty string
+		RegEx(char ch);	// REGEX_MATCH: matches exactly the character ch
+		RegEx(char a, char z);	// REGEX_RANGE: matches one character in the inclusive range a..z
+		RegEx(const std::string& str, REGEX_OP op = REGEX_SEQ);	// single-character sub-expressions (one per character of str) combined with op
+		~RegEx();
+
+		bool Matches(char ch) const;	// convenience wrapper: matches against the one-character string ch
+		bool Matches(const std::string& str) const;	// true when Match(str) >= 0
+		int Match(const std::string& str) const;	// number of characters matched from the start of str, or -1 on failure
+
+		friend RegEx operator ! (const RegEx& ex);	// builds a REGEX_NOT node with ex as its operand
+		friend RegEx operator || (const RegEx& ex1, const RegEx& ex2);	// builds a REGEX_OR node (ex1 is tried first)
+		friend RegEx operator + (const RegEx& ex1, const RegEx& ex2);	// builds a REGEX_SEQ node (ex1 followed by ex2)
+
+	private:
+		RegEx(REGEX_OP op);	// operand-less node of the given kind; used by the operators above
+
+	private:
+		REGEX_OP m_op;	// which kind of node this is
+		char m_a, m_z;	// m_a: literal for REGEX_MATCH; m_a..m_z: bounds for REGEX_RANGE (not set by the other constructors)
+		std::vector <RegEx> m_params;	// operands of REGEX_NOT / REGEX_OR / REGEX_SEQ nodes
+	};
+}
diff --git a/scanner.cpp b/scanner.cpp
index 6fed538d7d..e1495f5baa 100644
--- a/scanner.cpp
+++ b/scanner.cpp
@@ -30,7 +30,10 @@ namespace YAML
 	char Scanner::GetChar()
 	{
 		m_column++;
-		return INPUT.get();
+		char ch = INPUT.get();
+		if(ch == '\n')
+			m_column = 0;
+		return ch;
 	}
 
 	// Eat
@@ -87,18 +90,6 @@ namespace YAML
 		return false;
 	}
 
-	// IsLineBreak
-	bool Scanner::IsLineBreak(char ch)
-	{
-		return ch == '\n'; // TODO: More types of line breaks
-	}
-
-	// IsBlank
-	bool Scanner::IsBlank(char ch)
-	{
-		return IsLineBreak(ch) || ch == ' ' || ch == '\t' || ch == EOF;
-	}
-
 	// IsDocumentStart
 	bool Scanner::IsDocumentStart()
 	{
@@ -106,8 +97,7 @@ namespace YAML
 		if(m_column != 0)
 			return false;
 
-		std::string next = Peek(4);
-		return next[0] == '-' && next[1] == '-' && next[2] == '-' && IsBlank(next[3]);
+		return Exp::DocStart.Matches(Peek(4));
 	}
 
 	// IsDocumentEnd
@@ -117,61 +107,41 @@ namespace YAML
 		if(m_column != 0)
 			return false;
 
-		std::string next = Peek(4);
-		return next[0] == '.' && next[1] == '.' && next[2] == '.' && IsBlank(next[3]);
+		return Exp::DocEnd.Matches(Peek(4));
 	}
 
 	// IsBlockEntry
 	bool Scanner::IsBlockEntry()
 	{
-		std::string next = Peek(2);
-		return next[0] == Keys::BlockEntry && IsBlank(next[1]);
+		return Exp::BlockEntry.Matches(Peek(2));
 	}
 
 	// IsKey
 	bool Scanner::IsKey()
 	{
 		std::string next = Peek(2);
-		return next[0] == Keys::Key && (IsBlank(next[1]) || m_flowLevel > 0);
+		if(m_flowLevel > 0)
+			return Exp::KeyInFlow.Matches(next);
+		return Exp::Key.Matches(next);
 	}
 
 	// IsValue
 	bool Scanner::IsValue()
 	{
 		std::string next = Peek(2);
-		return next[0] == Keys::Value && (IsBlank(next[1]) || m_flowLevel > 0);
+		if(m_flowLevel > 0)
+			return Exp::ValueInFlow.Matches(next);
+		return Exp::Value.Matches(next);
 	}
 
 	// IsPlainScalar
 	// . Rules:
-	//   . Cannot start with a blank.
-	//   . Can never start with any of , [ ] { } # & * ! | > \' \" % @ `
-	//   . In the block context - ? : must be not be followed with a space.
-	//   . In the flow context ? : are illegal and - must not be followed with a space.
 	bool Scanner::IsPlainScalar()
 	{
 		std::string next = Peek(2);
-
-		if(IsBlank(next[0]))
-			return false;
-
-		// never characters
-		if(std::string(",[]{}#&*!|>\'\"%@`").find(next[0]) != std::string::npos)
-			return false;
-
-		// specific block/flow characters
-		if(m_flowLevel == 0) {
-			if((next[0] == '-' || next[0] == '?' || next[0] == ':') && IsBlank(next[1]))
-				return false;
-		} else {
-			if(next[0] == '?' || next[0]  == ':')
-				return false;
-
-			if(next[0] == '-' && IsBlank(next[1]))
-				return false;
-		}
-
-		return true;
+		if(m_flowLevel > 0)
+			return Exp::PlainScalarInFlow.Matches(next);
+		return Exp::PlainScalar.Matches(next);
 	}
 
 	///////////////////////////////////////////////////////////////////////
@@ -233,7 +203,7 @@ namespace YAML
 	// DocumentEndToken
 	template <> DocumentEndToken *Scanner::ScanToken(DocumentEndToken *pToken)
 	{
-		PopIndentTo(m_column);
+		PopIndentTo(-1);
 		// TODO: "reset simple keys"
 
 		m_simpleKeyAllowed = false;
@@ -389,8 +359,8 @@ namespace YAML
 		m_simpleKeyAllowed = false;
 
 		// now eat and store the scalar
-		std::string scalar;
-		bool leadingBlanks = true;
+		std::string scalar, whitespace, leadingBreaks, trailingBreaks;
+		bool leadingBlanks = false;
 
 		while(INPUT) {
 			// doc start/end tokens
@@ -398,43 +368,72 @@ namespace YAML
 				break;
 
 			// comment
-			if(INPUT.peek() == Keys::Comment)
+			if(Exp::Comment.Matches(INPUT.peek()))
 				break;
 
 			// first eat non-blanks
-			while(INPUT && !IsBlank(INPUT.peek())) {
+			while(INPUT && !Exp::BlankOrBreak.Matches(INPUT.peek())) {
 				std::string next = Peek(2);
 
 				// illegal colon in flow context
-				if(m_flowLevel > 0 && next[0] == ':') {
-					if(!IsBlank(next[1]))
-						throw IllegalScalar();
-				}
+				if(m_flowLevel > 0 && Exp::IllegalColonInScalar.Matches(next))
+					throw IllegalScalar();
 
 				// characters that might end the scalar
-				if(next[0] == ':' && IsBlank(next[1]))
+				if(m_flowLevel > 0 && Exp::EndScalarInFlow.Matches(next))
 					break;
-				if(m_flowLevel > 0 && std::string(",:?[]{}").find(next[0]) != std::string::npos)
+				if(m_flowLevel == 0 && Exp::EndScalar.Matches(next))
 					break;
 
+				if(leadingBlanks) {
+					if(!leadingBreaks.empty() && leadingBreaks[0] == '\n') {
+						// fold line break?
+						if(trailingBreaks.empty())
+							scalar += ' ';
+						else {
+							scalar += trailingBreaks;
+							trailingBreaks = "";
+						}
+					} else {
+						scalar += leadingBreaks + trailingBreaks;
+						leadingBreaks = "";
+						trailingBreaks = "";
+					}
+				} else if(!whitespace.empty()) {
+					scalar += whitespace;
+					whitespace = "";
+				}
+
+				// finally, read the character!
 				scalar += GetChar();
 			}
 
+			// did we hit a non-blank character that ended us?
+			if(!Exp::BlankOrBreak.Matches(INPUT.peek()))
+				break;
+
 			// now eat blanks
-			while(INPUT && (IsBlank(INPUT.peek()) /* || IsBreak(INPUT.peek()) */)) {
-				if(IsBlank(INPUT.peek())) {
+			while(INPUT && Exp::BlankOrBreak.Matches(INPUT.peek())) {
+				if(Exp::Blank.Matches(INPUT.peek())) {
 					if(leadingBlanks && m_column <= m_indents.top())
 						throw IllegalTabInScalar();
 
-					// TODO: Store some blanks?
-					Eat(1);
+					// maybe store this character
+					if(!leadingBlanks)
+						whitespace += GetChar();
+					else
+						Eat(1);
 				} else {
-					Eat(1);
+					// where to store this character?
+					if(!leadingBlanks) {
+						leadingBlanks = true;
+						whitespace = "";
+						leadingBreaks += GetChar();
+					} else
+						trailingBreaks += GetChar();
 				}
 			}
 
-			// TODO: join whitespace
-
 			// and finally break if we're below the indentation level
 			if(m_flowLevel == 0 && m_column <= m_indents.top())
 				break;
@@ -532,14 +531,14 @@ namespace YAML
 				Eat(1);
 
 			// then eat a comment
-			if(INPUT.peek() == Keys::Comment) {
+			if(Exp::Comment.Matches(INPUT.peek())) {
 				// eat until line break
-				while(INPUT && !IsLineBreak(INPUT.peek()))
+				while(INPUT && !Exp::Break.Matches(INPUT.peek()))
 					Eat(1);
 			}
 
 			// if it's NOT a line break, then we're done!
-			if(!IsLineBreak(INPUT.peek()))
+			if(!Exp::Break.Matches(INPUT.peek()))
 				break;
 
 			// otherwise, let's eat the line break and keep going
diff --git a/scanner.h b/scanner.h
index 83f2583c9f..94cd88ab0e 100644
--- a/scanner.h
+++ b/scanner.h
@@ -5,22 +5,49 @@
 #include <queue>
 #include <stack>
 #include <set>
+#include "regex.h"
 
 namespace YAML
 {
 	class Token;
 
+	namespace Exp
+	{
+		// misc (single-character building blocks)
+		const RegEx Blank = RegEx(' ') || RegEx('\t');
+		const RegEx Break = RegEx('\n');
+		const RegEx BlankOrBreak = Blank || Break;
+
+		// actual tags (RegEx() lets the document markers also match when nothing follows them)
+
+		const RegEx DocStart = RegEx("---") + (BlankOrBreak || RegEx(EOF) || RegEx());
+		const RegEx DocEnd = RegEx("...") + (BlankOrBreak || RegEx(EOF) || RegEx());
+		const RegEx BlockEntry = RegEx('-') + (BlankOrBreak || RegEx(EOF));
+		const RegEx Key = RegEx('?') + BlankOrBreak,	// block context: the indicator must be followed by a blank or break
+		            KeyInFlow = RegEx('?');	// flow context: the indicator alone is enough
+		const RegEx Value = RegEx(':') + BlankOrBreak,	// block context: the indicator must be followed by a blank or break
+		            ValueInFlow = RegEx(':');	// flow context: the indicator alone is enough
+		const RegEx Comment = RegEx('#');
+
+		// Plain scalar rules:
+		// . Cannot start with a blank.
+		// . Can never start with any of , [ ] { } # & * ! | > \' \" % @ `
+		// . In the block context - ? : must not be followed by a space.
+		// . In the flow context ? : are illegal and - must not be followed by a space.
+		const RegEx PlainScalar = !(BlankOrBreak || RegEx(",[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx("-?:", REGEX_OR) + Blank)),
+		            PlainScalarInFlow = !(BlankOrBreak || RegEx("?:,[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx('-') + Blank));
+		const RegEx IllegalColonInScalar = RegEx(':') + !BlankOrBreak;
+		const RegEx EndScalar = RegEx(':') + BlankOrBreak,
+		            EndScalarInFlow = (RegEx(':') + BlankOrBreak) || RegEx(",:?[]{}", REGEX_OR);	// any ONE of these characters ends the scalar
+	}
+
 	namespace Keys
 	{
-		const char Comment = '#';
 		const char FlowSeqStart = '[';
 		const char FlowSeqEnd = ']';
 		const char FlowMapStart = '{';
 		const char FlowMapEnd = '}';
 		const char FlowEntry = ',';
-		const char BlockEntry = '-';
-		const char Key = '?';
-		const char Value = ':';
 		const char Alias = '*';
 		const char Anchor = '&';
 		const char Tag = '!';
@@ -49,8 +76,6 @@ namespace YAML
 		void EatLineBreak();
 
 		bool IsWhitespaceToBeEaten(char ch);
-		bool IsLineBreak(char ch);
-		bool IsBlank(char ch);
 		bool IsDocumentStart();
 		bool IsDocumentEnd();
 		bool IsBlockEntry();
diff --git a/test.yaml b/test.yaml
index 0581ec1257..d3ad2c7190 100644
--- a/test.yaml
+++ b/test.yaml
@@ -1,3 +1,5 @@
+---
 - milk
-- eggs
-- cheese and bread       # this is really important!
+- eggs             # this is really important!
+- cheese and bread
+...
\ No newline at end of file
diff --git a/yaml-reader.vcproj b/yaml-reader.vcproj
index 04a440f9ab..e0c32de3f9 100644
--- a/yaml-reader.vcproj
+++ b/yaml-reader.vcproj
@@ -185,6 +185,10 @@
 				RelativePath=".\parser.cpp"
 				>
 			</File>
+			<File
+				RelativePath=".\regex.cpp"
+				>
+			</File>
 			<File
 				RelativePath=".\scalar.cpp"
 				>
@@ -227,6 +231,10 @@
 				RelativePath=".\parser.h"
 				>
 			</File>
+			<File
+				RelativePath=".\regex.h"
+				>
+			</File>
 			<File
 				RelativePath=".\scalar.h"
 				>