Refactored common scalar scanning code (from plain, quoted, and block) to one function.

2025-07-04 22:11:26 +12:00 · 2008-06-29 05:45:41 +00:00 · 2008-06-29 05:45:41 +00:00 · 0d5a97bffe
commit 0d5a97bffe
parent 6c193d6fbd
6 changed files with 306 additions and 188 deletions
--- a/exp.cpp
+++ b/exp.cpp
@ -62,7 +62,7 @@ namespace YAML
 		}

 		// Escape
-		// . Escapes the sequence starting 'in' (it must begin with a '\')
+		// . Escapes the sequence starting 'in' (it must begin with a '\' or single quote)
 		//   and returns the result.
 		// . Fills 'length' with how many characters we ate.
 		// . Throws if it's an unknown escape character.
@ -72,10 +72,16 @@ namespace YAML
 			length = 2;

 			// eat slash
-			in.get();
+			char escape = in.get();

 			// switch on escape character
 			char ch = in.get();
+
+			// first do single quote, since it's easier
+			if(escape == '\'' && ch == '\'')
+				return "\'";
+
+			// now do the slash (we're not gonna check if it's a slash - you better pass one!)
 			switch(ch) {
 				case '0': return "\0";
 				case 'a': return "\x07";
--- a/regex.cpp
+++ b/regex.cpp
@ -53,6 +53,7 @@ namespace YAML
 			case REGEX_MATCH: m_pOp = new MatchOperator; break;
 			case REGEX_RANGE: m_pOp = new RangeOperator; break;
 			case REGEX_OR: m_pOp = new OrOperator; break;
+			case REGEX_AND: m_pOp = new AndOperator; break;
 			case REGEX_NOT: m_pOp = new NotOperator; break;
 			case REGEX_SEQ: m_pOp = new SeqOperator; break;
 		}
@ -80,19 +81,13 @@ namespace YAML
 	// . Returns the number of characters matched.
 	// . Returns -1 if no characters were matched (the reason for
 	//   not returning zero is that we may have an empty regex
-	//   which SHOULD be considered successfully matching nothing,
-	//   but that of course matches zero characters).
+	//   which is ALWAYS successful at matching zero characters).
 	int RegEx::Match(const std::string& str) const
 	{
 		if(!m_pOp)
-			return -1;
+			return 0;

 		return m_pOp->Match(str, *this);
-
-			//case REGEX_EMPTY:
-			//	if(str.empty())
-			//		return 0;
-			//	return -1;
 	}

 	// Match
@ -131,6 +126,14 @@ namespace YAML
 		return ret;
 	}

+	RegEx operator && (const RegEx& ex1, const RegEx& ex2) // conjunction: builds a REGEX_AND expression from two operands
+	{
+		RegEx ret(REGEX_AND);
+		ret.m_params.push_back(ex1); // order matters: AndOperator::Match returns the length matched by the first operand
+		ret.m_params.push_back(ex2);
+		return ret;
+	}
+
 	RegEx operator + (const RegEx& ex1, const RegEx& ex2)
 	{
 		RegEx ret(REGEX_SEQ);
@ -194,6 +197,36 @@ namespace YAML
 		return -1;
 	}

+	// AndOperator (string version)
+	// . Succeeds only if EVERY sub-expression in regex.m_params matches 'str'.
+	// . The sub-expressions may match different lengths; on success we return the
+	//   length matched by the FIRST one. Returns -1 if any fails (or there are none).
+	int RegEx::AndOperator::Match(const std::string& str, const RegEx& regex) const
+	{
+		int first = -1; // stays -1 when m_params is empty
+		for(unsigned i=0;i<regex.m_params.size();i++) {
+			int n = regex.m_params[i].Match(str);
+			if(n == -1)
+				return -1; // a single failing operand fails the whole AND
+			if(i == 0)
+				first = n; // only the first operand's length is reported
+		}
+		return first;
+	}
+
+	int RegEx::AndOperator::Match(std::istream& in, const RegEx& regex) const // stream version: same rules as the string overload
+	{
+		int first = -1; // stays -1 when m_params is empty
+		for(unsigned i=0;i<regex.m_params.size();i++) {
+			int n = regex.m_params[i].Match(in); // NOTE(review): every operand reads the same stream - assumes Match(std::istream&) does not consume input; confirm
+			if(n == -1)
+				return -1; // a single failing operand fails the whole AND
+			if(i == 0)
+				first = n; // only the first operand's length is reported
+		}
+		return first;
+	}
+
 	// NotOperator
 	int RegEx::NotOperator::Match(const std::string& str, const RegEx& regex) const
 	{
--- a/regex.h
+++ b/regex.h
@ -6,7 +6,7 @@

 namespace YAML
 {
-	enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_NOT, REGEX_SEQ };
+	enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_AND, REGEX_NOT, REGEX_SEQ };

 	// simplified regular expressions
 	// . Only straightforward matches (no repeated characters)
@ -35,6 +35,11 @@ namespace YAML
 			virtual int Match(std::istream& in, const RegEx& regex) const;
 		};

+		struct AndOperator: public Operator { // operator behind REGEX_AND: all sub-expressions must match
+			virtual int Match(const std::string& str, const RegEx& regex) const; // match against a string; -1 on failure
+			virtual int Match(std::istream& in, const RegEx& regex) const; // match against a stream; -1 on failure
+		};
+
 		struct NotOperator: public Operator {
 			virtual int Match(const std::string& str, const RegEx& regex) const;
 			virtual int Match(std::istream& in, const RegEx& regex) const;
@ -63,6 +68,7 @@ namespace YAML

 		friend RegEx operator ! (const RegEx& ex);
 		friend RegEx operator || (const RegEx& ex1, const RegEx& ex2);
+		friend RegEx operator && (const RegEx& ex1, const RegEx& ex2);
 		friend RegEx operator + (const RegEx& ex1, const RegEx& ex2);

 	private:
--- a/scanner.h
+++ b/scanner.h
@ -5,6 +5,7 @@
 #include <queue>
 #include <stack>
 #include <set>
+#include "regex.h"

 namespace YAML
 {
@ -44,6 +45,7 @@ namespace YAML
 		bool IsPlainScalar();

 		void GetBlockIndentation(int& indent, std::string& breaks);
+		std::string ScanScalar(RegEx end, bool eatEnd, int indent, char escape, bool fold, bool eatLeadingWhitespace, bool trimTrailingSpaces, int chomp);

 		struct SimpleKey {
 			SimpleKey(int pos_, int line_, int column_, int flowLevel_);
--- a/scanscalar.cpp
+++ b/scanscalar.cpp
@ -75,74 +75,77 @@ namespace YAML
 	//   and in-line whitespace (which is kept) separately.
 	template <> PlainScalarToken *Scanner::ScanToken(PlainScalarToken *pToken)
 	{
+		//// now eat and store the scalar
+		//std::string scalar;
+		//WhitespaceInfo info;
+
+		//while(INPUT) {
+		//	// doc start/end tokens
+		//	if(IsDocumentStart() || IsDocumentEnd())
+		//		break;
+
+		//	// comment
+		//	if(Exp::Comment.Matches(INPUT))
+		//		break;
+
+		//	// first eat non-blanks
+		//	while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
+		//		// illegal colon in flow context
+		//		if(m_flowLevel > 0 && Exp::IllegalColonInScalar.Matches(INPUT))
+		//			throw IllegalScalar();
+
+		//		// characters that might end the scalar
+		//		if(m_flowLevel > 0 && Exp::EndScalarInFlow.Matches(INPUT))
+		//			break;
+		//		if(m_flowLevel == 0 && Exp::EndScalar.Matches(INPUT))
+		//			break;
+
+		//		// finally, read the character!
+		//		scalar += GetChar();
+		//	}
+
+		//	// did we hit a non-blank character that ended us?
+		//	if(!Exp::BlankOrBreak.Matches(INPUT))
+		//		break;
+
+		//	// now eat blanks
+		//	while(INPUT && Exp::BlankOrBreak.Matches(INPUT)) {
+		//		if(Exp::Blank.Matches(INPUT)) {
+		//			// can't use tabs as indentation! only spaces!
+		//			if(INPUT.peek() == '\t' && info.leadingBlanks && m_column <= m_indents.top())
+		//				throw IllegalTabInScalar();
+
+		//			info.AddBlank(GetChar());
+		//		} else	{
+		//			// we know it's a line break; see how many characters to read
+		//			int n = Exp::Break.Match(INPUT);
+		//			std::string line = GetChar(n);
+		//			info.AddBreak(line);
+
+		//			// and we can't continue a simple key to the next line
+		//			ValidateSimpleKey();
+		//		}
+		//	}
+
+		//	// break if we're below the indentation level
+		//	if(m_flowLevel == 0 && m_column <= m_indents.top())
+		//		break;
+
+		//	// finally join whitespace
+		//	scalar += info.Join();
+		//}
+
+		RegEx end = (m_flowLevel > 0 ? Exp::EndScalarInFlow : Exp::EndScalar) || (RegEx(' ') + Exp::Comment);
+		int indent = (m_flowLevel > 0 ? 0 : m_indents.top() + 1);
+
 		// insert a potential simple key
 		if(m_simpleKeyAllowed)
 			InsertSimpleKey();
+
+		pToken->value = ScanScalar(end, false, indent, 0, true, true, true, 0);
+
 		m_simpleKeyAllowed = false;
-
-		// now eat and store the scalar
-		std::string scalar;
-		WhitespaceInfo info;
-
-		while(INPUT) {
-			// doc start/end tokens
-			if(IsDocumentStart() || IsDocumentEnd())
-				break;
-
-			// comment
-			if(Exp::Comment.Matches(INPUT))
-				break;
-
-			// first eat non-blanks
-			while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
-				// illegal colon in flow context
-				if(m_flowLevel > 0 && Exp::IllegalColonInScalar.Matches(INPUT))
-					throw IllegalScalar();
-
-				// characters that might end the scalar
-				if(m_flowLevel > 0 && Exp::EndScalarInFlow.Matches(INPUT))
-					break;
-				if(m_flowLevel == 0 && Exp::EndScalar.Matches(INPUT))
-					break;
-
-				// finally, read the character!
-				scalar += GetChar();
-			}
-
-			// did we hit a non-blank character that ended us?
-			if(!Exp::BlankOrBreak.Matches(INPUT))
-				break;
-
-			// now eat blanks
-			while(INPUT && Exp::BlankOrBreak.Matches(INPUT)) {
-				if(Exp::Blank.Matches(INPUT)) {
-					// can't use tabs as indentation! only spaces!
-					if(INPUT.peek() == '\t' && info.leadingBlanks && m_column <= m_indents.top())
-						throw IllegalTabInScalar();
-
-					info.AddBlank(GetChar());
-				} else	{
-					// we know it's a line break; see how many characters to read
-					int n = Exp::Break.Match(INPUT);
-					std::string line = GetChar(n);
-					info.AddBreak(line);
-
-					// and we can't continue a simple key to the next line
-					ValidateSimpleKey();
-				}
-			}
-
-			// break if we're below the indentation level
-			if(m_flowLevel == 0 && m_column <= m_indents.top())
-				break;
-
-			// finally join whitespace
-			scalar += info.Join();
-		}
-
-		// now modify our token
-		pToken->value = scalar;
-		if(info.leadingBlanks)
+		if(true/*info.leadingBlanks*/)
 			m_simpleKeyAllowed = true;

 		return pToken;
@ -151,91 +154,92 @@ namespace YAML
 	// QuotedScalarToken
 	template <> QuotedScalarToken *Scanner::ScanToken(QuotedScalarToken *pToken)
 	{
-		// insert a potential simple key
-		if(m_simpleKeyAllowed)
-			InsertSimpleKey();
-		m_simpleKeyAllowed = false;
+		//// now eat and store the scalar
+		//std::string scalar;
+		//WhitespaceInfo info;
+
+		//while(INPUT) {
+		//	if(IsDocumentStart() || IsDocumentEnd())
+		//		throw DocIndicatorInQuote();
+
+		//	if(INPUT.peek() == EOF)
+		//		throw EOFInQuote();
+
+		//	// first eat non-blanks
+		//	while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
+		//		// escaped single quote?
+		//		if(pToken->single && Exp::EscSingleQuote.Matches(INPUT)) {
+		//			int n = Exp::EscSingleQuote.Match(INPUT);
+		//			scalar += GetChar(n);
+		//			continue;
+		//		}
+
+		//		// is the quote ending?
+		//		if(INPUT.peek() == quote)
+		//			break;
+
+		//		// escaped newline?
+		//		if(Exp::EscBreak.Matches(INPUT))
+		//			break;
+
+		//		// other escape sequence
+		//		if(INPUT.peek() == '\\') {
+		//			int length = 0;
+		//			scalar += Exp::Escape(INPUT, length);
+		//			m_column += length;
+		//			continue;
+		//		}
+
+		//		// and finally, just add the damn character
+		//		scalar += GetChar();
+		//	}
+
+		//	// is the quote ending?
+		//	if(INPUT.peek() == quote) {
+		//		// eat and go
+		//		GetChar();
+		//		break;
+		//	}
+
+		//	// now we eat blanks
+		//	while(Exp::BlankOrBreak.Matches(INPUT)) {
+		//		if(Exp::Blank.Matches(INPUT)) {
+		//			info.AddBlank(GetChar());
+		//		} else {
+		//			// we know it's a line break; see how many characters to read
+		//			int n = Exp::Break.Match(INPUT);
+		//			std::string line = GetChar(n);
+		//			info.AddBreak(line);
+
+		//			// and we can't continue a simple key to the next line
+		//			ValidateSimpleKey();
+		//		}
+		//	}
+
+		//	// and finally join the whitespace
+		//	scalar += info.Join();
+		//}

 		// eat single or double quote
 		char quote = GetChar();
 		pToken->single = (quote == '\'');

-		// now eat and store the scalar
-		std::string scalar;
-		WhitespaceInfo info;
+		RegEx end = (pToken->single ? RegEx(quote) && !Exp::EscSingleQuote : RegEx(quote));
+		char escape = (pToken->single ? '\'' : '\\');

-		while(INPUT) {
-			if(IsDocumentStart() || IsDocumentEnd())
-				throw DocIndicatorInQuote();
+		// insert a potential simple key
+		if(m_simpleKeyAllowed)
+			InsertSimpleKey();

-			if(INPUT.peek() == EOF)
-				throw EOFInQuote();
+		pToken->value = ScanScalar(end, true, 0, escape, true, true, false, 0);
+		m_simpleKeyAllowed = false;

-			// first eat non-blanks
-			while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
-				// escaped single quote?
-				if(pToken->single && Exp::EscSingleQuote.Matches(INPUT)) {
-					int n = Exp::EscSingleQuote.Match(INPUT);
-					scalar += GetChar(n);
-					continue;
-				}
-
-				// is the quote ending?
-				if(INPUT.peek() == quote)
-					break;
-
-				// escaped newline?
-				if(Exp::EscBreak.Matches(INPUT))
-					break;
-
-				// other escape sequence
-				if(INPUT.peek() == '\\') {
-					int length = 0;
-					scalar += Exp::Escape(INPUT, length);
-					m_column += length;
-					continue;
-				}
-
-				// and finally, just add the damn character
-				scalar += GetChar();
-			}
-
-			// is the quote ending?
-			if(INPUT.peek() == quote) {
-				// eat and go
-				GetChar();
-				break;
-			}
-
-			// now we eat blanks
-			while(Exp::BlankOrBreak.Matches(INPUT)) {
-				if(Exp::Blank.Matches(INPUT)) {
-					info.AddBlank(GetChar());
-				} else {
-					// we know it's a line break; see how many characters to read
-					int n = Exp::Break.Match(INPUT);
-					std::string line = GetChar(n);
-					info.AddBreak(line);
-
-					// and we can't continue a simple key to the next line
-					ValidateSimpleKey();
-				}
-			}
-
-			// and finally join the whitespace
-			scalar += info.Join();
-		}
-
-		pToken->value = scalar;
 		return pToken;
 	}

 	// BlockScalarToken
 	template <> BlockScalarToken *Scanner::ScanToken(BlockScalarToken *pToken)
 	{
-		// simple keys always ok after block scalars (since we're gonna start a new line anyways)
-		m_simpleKeyAllowed = true;
-
 		WhitespaceInfo info;

 		// eat block indicator ('|' or '>')
@ -268,37 +272,13 @@ namespace YAML
 		if(info.increment && m_indents.top() >= 0)
 			indent += m_indents.top();

-		// finally, grab that scalar
-		std::string scalar;
-		while(INPUT) {
-			// initialize indentation
 		GetBlockIndentation(indent, info.trailingBreaks);

-			// are we done with this guy (i.e. at a lower indentation?)
-			if(m_column != indent)
-				break;
-
-			bool trailingBlank = Exp::Blank.Matches(INPUT);
-			scalar += info.Join();
-
-			bool leadingBlank = Exp::Blank.Matches(INPUT);
-
-			// now eat and save the line
-			while(INPUT.peek() != EOF && !Exp::Break.Matches(INPUT))
-				scalar += GetChar();
-
-			// we know it's a line break; see how many characters to read
-			int n = Exp::Break.Match(INPUT);
-			std::string line = GetChar(n);
-			info.AddBreak(line);
-		}
-
-		// one last whitespace join (with chompers this time)
-		scalar += info.Join(true);
-
-		// finally set the scalar
-		pToken->value = scalar;
+		bool eatLeadingWhitespace = false;
+		pToken->value = ScanScalar(RegEx(), false, indent, 0, info.fold, eatLeadingWhitespace, false, info.chomp);

+		// simple keys always ok after block scalars (since we're gonna start a new line anyways)
+		m_simpleKeyAllowed = true;
 		return pToken;
 	}

@ -340,4 +320,104 @@ namespace YAML
 				indent = 1;
 		}
 	}
+
+	// ScanScalar
+	// . Shared scanning loop for plain, quoted and block scalars: reads from
+	//   INPUT line by line and returns the accumulated text.
+	// . 'end': expression that terminates the scalar; it is only consumed if
+	//   'eatEnd' is set (in which case hitting EOF first throws EOFInQuote).
+	// . 'indent': column up to which leading spaces are eaten on each new line;
+	//   a non-empty line that starts before this column ends the scalar.
+	// . 'escape': character that introduces an escape sequence, handed to
+	//   Exp::Escape; with '\\' an escaped line break is also swallowed.
+	//   Callers in this file pass 0 when nothing should be escaped.
+	// . 'fold': join consecutive ordinary lines with a space instead of a newline.
+	// . 'eatLeadingWhitespace': also skip blanks beyond the required indentation.
+	// . 'trimTrailingSpaces': strip spaces from the end of the result.
+	// . 'chomp': -1 strips every trailing newline, 0 keeps at most one,
+	//   anything positive leaves the trailing newlines alone.
+	std::string Scanner::ScanScalar(RegEx end, bool eatEnd, int indent, char escape, bool fold, bool eatLeadingWhitespace, bool trimTrailingSpaces, int chomp)
+	{
+		// what the previous line looked like; both feed the folding decision below
+		bool emptyLine = false, moreIndented = false;
+		std::string scalar;
+
+		while(INPUT) {
+			// ********************************
+			// Phase #1: scan until line ending
+			while(!end.Matches(INPUT) && !Exp::Break.Matches(INPUT)) {
+				if(INPUT.peek() == EOF)
+					break;
+
+				// escaped newline? (only if we're escaping on slash)
+				if(escape == '\\' && Exp::EscBreak.Matches(INPUT)) {
+					int n = Exp::EscBreak.Match(INPUT);
+					Eat(n);
+					continue;
+				}
+
+				// escape this? (Exp::Escape fills 'length' with the number of
+				// characters it ate, so we advance the column by that much)
+				if(INPUT.peek() == escape) {
+					int length = 0;
+					scalar += Exp::Escape(INPUT, length);
+					m_column += length;
+					continue;
+				}
+
+				// otherwise, just add the character
+				scalar += GetChar();
+			}
+
+			// eof? if we're looking to eat something, then we throw
+			if(INPUT.peek() == EOF) {
+				if(eatEnd)
+					throw EOFInQuote();
+				break;
+			}
+
+			// are we done via character match?
+			int n = end.Match(INPUT);
+			if(n >= 0) {
+				if(eatEnd)
+					Eat(n);
+				break;
+			}
+
+			// ********************************
+			// Phase #2: eat line ending
+			n = Exp::Break.Match(INPUT);
+			Eat(n);
+
+			// ********************************
+			// Phase #3: scan initial spaces
+
+			// first the required indentation
+			while(INPUT.peek() == ' ' && m_column < indent)
+				Eat(1);
+
+			// and then the rest of the whitespace
+			if(eatLeadingWhitespace) {
+				while(Exp::Blank.Matches(INPUT))
+					Eat(1);
+			}
+
+			// was this an empty line? is it indented past the required column?
+			bool nextEmptyLine = Exp::Break.Matches(INPUT);
+			bool nextMoreIndented = (INPUT.peek() == ' ');
+
+			// the line break we just ate becomes a space only when folding and
+			// neither neighbouring line is empty or more indented
+			if(fold && !emptyLine && !nextEmptyLine && !moreIndented && !nextMoreIndented)
+				scalar += " ";
+			else
+				scalar += "\n";
+
+			emptyLine = nextEmptyLine;
+			moreIndented = nextMoreIndented;
+
+			// are we done via indentation?
+			if(!emptyLine && m_column < indent)
+				break;
+		}
+
+		// post-processing
+		// (find_last_not_of returns std::string::npos when every character
+		//  matches; npos + 1 wraps to 0, so erase() then clears the whole string.
+		//  The result must stay a size_type - narrowing it to 'unsigned' loses
+		//  the npos case.)
+		if(trimTrailingSpaces)
+			scalar.erase(scalar.find_last_not_of(' ') + 1);
+
+		if(chomp <= 0) {
+			const std::string::size_type pos = scalar.find_last_not_of('\n');
+			if(chomp == 0 && pos + 1 < scalar.size())
+				scalar.erase(pos + 2); // clip: keep a single trailing newline
+			else if(chomp == -1)
+				scalar.erase(pos + 1); // strip: drop every trailing newline
+		}
+
+		return scalar;
+	}
 }
--- a/test.yaml
+++ b/test.yaml
@ -1,13 +1,4 @@
-people:
-  - &jsb
-    name: Jesse
-    age: 23
-  - &dab
-    name: 'Daniel'
-    age: 25
-  - &ncb
-    name: "Naftali"
-    age: 21
-students:
-  - *jsb
-  - *ncb
+---
+- "quoted scalar that contains
+---
+    the document start!"