Refactored common scalar scanning code (from plain, quoted, and block) to one function.

2025-07-04 22:11:26 +12:00 · 2008-06-29 05:45:41 +00:00 · 2008-06-29 05:45:41 +00:00 · 0d5a97bffe
commit 0d5a97bffe
parent 6c193d6fbd
6 changed files with 306 additions and 188 deletions
--- a/exp.cpp
+++ b/exp.cpp
@ -62,7 +62,7 @@ namespace YAML
 		}
 		// Escape
-		// . Escapes the sequence starting 'in' (it must begin with a '\')
+		// . Escapes the sequence starting 'in' (it must begin with a '\' or single quote)
 		//   and returns the result.
 		// . Fills 'length' with how many characters we ate.
 		// . Throws if it's an unknown escape character.
@ -72,10 +72,16 @@ namespace YAML
 			length = 2;
 			// eat slash
-			in.get();
+			char escape = in.get();
 			// switch on escape character
 			char ch = in.get();
 			// first do single quote, since it's easier
 			if(escape == '\'' && ch == '\'')
 				return "\'";
 			// now do the slash (we're not gonna check if it's a slash - you better pass one!)
 			switch(ch) {
 				case '0': return "\0";
 				case 'a': return "\x07";
--- a/regex.cpp
+++ b/regex.cpp
@ -53,6 +53,7 @@ namespace YAML
 			case REGEX_MATCH: m_pOp = new MatchOperator; break;
 			case REGEX_RANGE: m_pOp = new RangeOperator; break;
 			case REGEX_OR: m_pOp = new OrOperator; break;
 			case REGEX_AND: m_pOp = new AndOperator; break;
 			case REGEX_NOT: m_pOp = new NotOperator; break;
 			case REGEX_SEQ: m_pOp = new SeqOperator; break;
 		}
@ -80,19 +81,13 @@ namespace YAML
 	// . Returns the number of characters matched.
 	// . Returns -1 if no characters were matched (the reason for
 	//   not returning zero is that we may have an empty regex
-	//   which SHOULD be considered successfully matching nothing,
+	//   which is ALWAYS successful at matching zero characters,
 	//   and a successful match of zero characters returns zero).
 	int RegEx::Match(const std::string& str) const
 	{
 		if(!m_pOp)
-			return -1;
+			return 0;
 		return m_pOp->Match(str, *this);
 			//case REGEX_EMPTY:
 			//	if(str.empty())
 			//		return 0;
 			//	return -1;
 	}
 	// Match
@ -131,6 +126,14 @@ namespace YAML
 		return ret;
 	}
 	// Builds a REGEX_AND expression whose parameters are 'ex1' followed by 'ex2'.
 	RegEx operator && (const RegEx& ex1, const RegEx& ex2)
 	{
 		RegEx both(REGEX_AND);
 		both.m_params.push_back(ex1);
 		both.m_params.push_back(ex2);
 		return both;
 	}
 	RegEx operator + (const RegEx& ex1, const RegEx& ex2)
 	{
 		RegEx ret(REGEX_SEQ);
@ -194,6 +197,36 @@ namespace YAML
 		return -1;
 	}
 	// AndOperator
 	// Note: 'AND' succeeds only when EVERY parameter matches, and the
 	//       parameters may well match different lengths. On success we report
 	//       the length matched by the FIRST parameter; if any parameter fails
 	//       (or there are no parameters at all) we report -1.
 	int RegEx::AndOperator::Match(const std::string& str, const RegEx& regex) const
 	{
 		if(regex.m_params.size() == 0)
 			return -1;
 		// the first parameter decides the length we hand back
 		const int length = regex.m_params[0].Match(str);
 		if(length == -1)
 			return -1;
 		// every remaining parameter only has to agree that there is a match
 		for(unsigned k = 1; k < regex.m_params.size(); k++) {
 			if(regex.m_params[k].Match(str) == -1)
 				return -1;
 		}
 		return length;
 	}
 	// Stream flavour of the 'AND' match: every parameter is matched against
 	// 'in', in order. Returns the length reported by the FIRST parameter, or
 	// -1 as soon as any parameter fails (or when there are no parameters).
 	int RegEx::AndOperator::Match(std::istream& in, const RegEx& regex) const
 	{
 		if(regex.m_params.size() == 0)
 			return -1;
 		// the first parameter decides the length we hand back
 		const int length = regex.m_params[0].Match(in);
 		if(length == -1)
 			return -1;
 		// every remaining parameter only has to agree that there is a match
 		for(unsigned k = 1; k < regex.m_params.size(); k++) {
 			if(regex.m_params[k].Match(in) == -1)
 				return -1;
 		}
 		return length;
 	}
 	// NotOperator
 	int RegEx::NotOperator::Match(const std::string& str, const RegEx& regex) const
 	{
--- a/regex.h
+++ b/regex.h
@ -6,7 +6,7 @@
 namespace YAML
 {
-	enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_NOT, REGEX_SEQ };
+	enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_AND, REGEX_NOT, REGEX_SEQ };
 	// simplified regular expressions
 	// . Only straightforward matches (no repeated characters)
@ -35,6 +35,11 @@ namespace YAML
 			virtual int Match(std::istream& in, const RegEx& regex) const;
 		};
 		struct AndOperator: public Operator {
 			virtual int Match(const std::string& str, const RegEx& regex) const;
 			virtual int Match(std::istream& in, const RegEx& regex) const;
 		};
 		struct NotOperator: public Operator {
 			virtual int Match(const std::string& str, const RegEx& regex) const;
 			virtual int Match(std::istream& in, const RegEx& regex) const;
@ -63,6 +68,7 @@ namespace YAML
 		friend RegEx operator ! (const RegEx& ex);
 		friend RegEx operator || (const RegEx& ex1, const RegEx& ex2);
 		friend RegEx operator && (const RegEx& ex1, const RegEx& ex2);
 		friend RegEx operator + (const RegEx& ex1, const RegEx& ex2);
 	private:
--- a/scanner.h
+++ b/scanner.h
@ -5,6 +5,7 @@
 #include <queue>
 #include <stack>
 #include <set>
 #include "regex.h"
 namespace YAML
 {
@ -44,6 +45,7 @@ namespace YAML
 		bool IsPlainScalar();
 		void GetBlockIndentation(int& indent, std::string& breaks);
 		std::string ScanScalar(RegEx end, bool eatEnd, int indent, char escape, bool fold, bool eatLeadingWhitespace, bool trimTrailingSpaces, int chomp);
 		struct SimpleKey {
 			SimpleKey(int pos_, int line_, int column_, int flowLevel_);
--- a/scanscalar.cpp
+++ b/scanscalar.cpp
@ -75,74 +75,77 @@ namespace YAML
 	//   and in-line whitespace (which is kept) separately.
 	template <> PlainScalarToken *Scanner::ScanToken(PlainScalarToken *pToken)
 	{
 		//// now eat and store the scalar
 		//std::string scalar;
 		//WhitespaceInfo info;
 		//while(INPUT) {
 		//	// doc start/end tokens
 		//	if(IsDocumentStart() || IsDocumentEnd())
 		//		break;
 		//	// comment
 		//	if(Exp::Comment.Matches(INPUT))
 		//		break;
 		//	// first eat non-blanks
 		//	while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
 		//		// illegal colon in flow context
 		//		if(m_flowLevel > 0 && Exp::IllegalColonInScalar.Matches(INPUT))
 		//			throw IllegalScalar();
 		//		// characters that might end the scalar
 		//		if(m_flowLevel > 0 && Exp::EndScalarInFlow.Matches(INPUT))
 		//			break;
 		//		if(m_flowLevel == 0 && Exp::EndScalar.Matches(INPUT))
 		//			break;
 		//		// finally, read the character!
 		//		scalar += GetChar();
 		//	}
 		//	// did we hit a non-blank character that ended us?
 		//	if(!Exp::BlankOrBreak.Matches(INPUT))
 		//		break;
 		//	// now eat blanks
 		//	while(INPUT && Exp::BlankOrBreak.Matches(INPUT)) {
 		//		if(Exp::Blank.Matches(INPUT)) {
 		//			// can't use tabs as indentation! only spaces!
 		//			if(INPUT.peek() == '\t' && info.leadingBlanks && m_column <= m_indents.top())
 		//				throw IllegalTabInScalar();
 		//			info.AddBlank(GetChar());
 		//		} else	{
 		//			// we know it's a line break; see how many characters to read
 		//			int n = Exp::Break.Match(INPUT);
 		//			std::string line = GetChar(n);
 		//			info.AddBreak(line);
 		//			// and we can't continue a simple key to the next line
 		//			ValidateSimpleKey();
 		//		}
 		//	}
 		//	// break if we're below the indentation level
 		//	if(m_flowLevel == 0 && m_column <= m_indents.top())
 		//		break;
 		//	// finally join whitespace
 		//	scalar += info.Join();
 		//}
 		RegEx end = (m_flowLevel > 0 ? Exp::EndScalarInFlow : Exp::EndScalar) || (RegEx(' ') + Exp::Comment);
 		int indent = (m_flowLevel > 0 ? 0 : m_indents.top() + 1);
 		// insert a potential simple key
 		if(m_simpleKeyAllowed)
 			InsertSimpleKey();
 		pToken->value = ScanScalar(end, false, indent, 0, true, true, true, 0);
 		m_simpleKeyAllowed = false;
-
+		if(true/*info.leadingBlanks*/)
 		// now eat and store the scalar
 		std::string scalar;
 		WhitespaceInfo info;
 		while(INPUT) {
 			// doc start/end tokens
 			if(IsDocumentStart() || IsDocumentEnd())
 				break;
 			// comment
 			if(Exp::Comment.Matches(INPUT))
 				break;
 			// first eat non-blanks
 			while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
 				// illegal colon in flow context
 				if(m_flowLevel > 0 && Exp::IllegalColonInScalar.Matches(INPUT))
 					throw IllegalScalar();
 				// characters that might end the scalar
 				if(m_flowLevel > 0 && Exp::EndScalarInFlow.Matches(INPUT))
 					break;
 				if(m_flowLevel == 0 && Exp::EndScalar.Matches(INPUT))
 					break;
 				// finally, read the character!
 				scalar += GetChar();
 			}
 			// did we hit a non-blank character that ended us?
 			if(!Exp::BlankOrBreak.Matches(INPUT))
 				break;
 			// now eat blanks
 			while(INPUT && Exp::BlankOrBreak.Matches(INPUT)) {
 				if(Exp::Blank.Matches(INPUT)) {
 					// can't use tabs as indentation! only spaces!
 					if(INPUT.peek() == '\t' && info.leadingBlanks && m_column <= m_indents.top())
 						throw IllegalTabInScalar();
 					info.AddBlank(GetChar());
 				} else	{
 					// we know it's a line break; see how many characters to read
 					int n = Exp::Break.Match(INPUT);
 					std::string line = GetChar(n);
 					info.AddBreak(line);
 					// and we can't continue a simple key to the next line
 					ValidateSimpleKey();
 				}
 			}
 			// break if we're below the indentation level
 			if(m_flowLevel == 0 && m_column <= m_indents.top())
 				break;
 			// finally join whitespace
 			scalar += info.Join();
 		}
 		// now modify our token
 		pToken->value = scalar;
 		if(info.leadingBlanks)
 			m_simpleKeyAllowed = true;
 		return pToken;
@ -151,91 +154,92 @@ namespace YAML
 	// QuotedScalarToken
 	template <> QuotedScalarToken *Scanner::ScanToken(QuotedScalarToken *pToken)
 	{
-		// insert a potential simple key
+		//// now eat and store the scalar
-		if(m_simpleKeyAllowed)
+		//std::string scalar;
-			InsertSimpleKey();
+		//WhitespaceInfo info;
-		m_simpleKeyAllowed = false;
+
 		//while(INPUT) {
 		//	if(IsDocumentStart() || IsDocumentEnd())
 		//		throw DocIndicatorInQuote();
 		//	if(INPUT.peek() == EOF)
 		//		throw EOFInQuote();
 		//	// first eat non-blanks
 		//	while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
 		//		// escaped single quote?
 		//		if(pToken->single && Exp::EscSingleQuote.Matches(INPUT)) {
 		//			int n = Exp::EscSingleQuote.Match(INPUT);
 		//			scalar += GetChar(n);
 		//			continue;
 		//		}
 		//		// is the quote ending?
 		//		if(INPUT.peek() == quote)
 		//			break;
 		//		// escaped newline?
 		//		if(Exp::EscBreak.Matches(INPUT))
 		//			break;
 		//		// other escape sequence
 		//		if(INPUT.peek() == '\\') {
 		//			int length = 0;
 		//			scalar += Exp::Escape(INPUT, length);
 		//			m_column += length;
 		//			continue;
 		//		}
 		//		// and finally, just add the damn character
 		//		scalar += GetChar();
 		//	}
 		//	// is the quote ending?
 		//	if(INPUT.peek() == quote) {
 		//		// eat and go
 		//		GetChar();
 		//		break;
 		//	}
 		//	// now we eat blanks
 		//	while(Exp::BlankOrBreak.Matches(INPUT)) {
 		//		if(Exp::Blank.Matches(INPUT)) {
 		//			info.AddBlank(GetChar());
 		//		} else {
 		//			// we know it's a line break; see how many characters to read
 		//			int n = Exp::Break.Match(INPUT);
 		//			std::string line = GetChar(n);
 		//			info.AddBreak(line);
 		//			// and we can't continue a simple key to the next line
 		//			ValidateSimpleKey();
 		//		}
 		//	}
 		//	// and finally join the whitespace
 		//	scalar += info.Join();
 		//}
 		// eat single or double quote
 		char quote = GetChar();
 		pToken->single = (quote == '\'');
-		// now eat and store the scalar
+		RegEx end = (pToken->single ? RegEx(quote) && !Exp::EscSingleQuote : RegEx(quote));
-		std::string scalar;
+		char escape = (pToken->single ? '\'' : '\\');
 		WhitespaceInfo info;
-		while(INPUT) {
+		// insert a potential simple key
-			if(IsDocumentStart() || IsDocumentEnd())
+		if(m_simpleKeyAllowed)
-				throw DocIndicatorInQuote();
+			InsertSimpleKey();
-			if(INPUT.peek() == EOF)
+		pToken->value = ScanScalar(end, true, 0, escape, true, true, false, 0);
-				throw EOFInQuote();
+		m_simpleKeyAllowed = false;
 			// first eat non-blanks
 			while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
 				// escaped single quote?
 				if(pToken->single && Exp::EscSingleQuote.Matches(INPUT)) {
 					int n = Exp::EscSingleQuote.Match(INPUT);
 					scalar += GetChar(n);
 					continue;
 				}
 				// is the quote ending?
 				if(INPUT.peek() == quote)
 					break;
 				// escaped newline?
 				if(Exp::EscBreak.Matches(INPUT))
 					break;
 				// other escape sequence
 				if(INPUT.peek() == '\\') {
 					int length = 0;
 					scalar += Exp::Escape(INPUT, length);
 					m_column += length;
 					continue;
 				}
 				// and finally, just add the damn character
 				scalar += GetChar();
 			}
 			// is the quote ending?
 			if(INPUT.peek() == quote) {
 				// eat and go
 				GetChar();
 				break;
 			}
 			// now we eat blanks
 			while(Exp::BlankOrBreak.Matches(INPUT)) {
 				if(Exp::Blank.Matches(INPUT)) {
 					info.AddBlank(GetChar());
 				} else {
 					// we know it's a line break; see how many characters to read
 					int n = Exp::Break.Match(INPUT);
 					std::string line = GetChar(n);
 					info.AddBreak(line);
 					// and we can't continue a simple key to the next line
 					ValidateSimpleKey();
 				}
 			}
 			// and finally join the whitespace
 			scalar += info.Join();
 		}
 		pToken->value = scalar;
 		return pToken;
 	}
 	// BlockScalarToken
 	template <> BlockScalarToken *Scanner::ScanToken(BlockScalarToken *pToken)
 	{
 		// simple keys always ok after block scalars (since we're gonna start a new line anyways)
 		m_simpleKeyAllowed = true;
 		WhitespaceInfo info;
 		// eat block indicator ('|' or '>')
@ -268,37 +272,13 @@ namespace YAML
 		if(info.increment && m_indents.top() >= 0)
 			indent += m_indents.top();
-		// finally, grab that scalar
+		GetBlockIndentation(indent, info.trailingBreaks);
 		std::string scalar;
 		while(INPUT) {
 			// initialize indentation
 			GetBlockIndentation(indent, info.trailingBreaks);
-			// are we done with this guy (i.e. at a lower indentation?)
+		bool eatLeadingWhitespace = false;
-			if(m_column != indent)
+		pToken->value = ScanScalar(RegEx(), false, indent, 0, info.fold, eatLeadingWhitespace, false, info.chomp);
 				break;
 			bool trailingBlank = Exp::Blank.Matches(INPUT);
 			scalar += info.Join();
 			bool leadingBlank = Exp::Blank.Matches(INPUT);
 			// now eat and save the line
 			while(INPUT.peek() != EOF && !Exp::Break.Matches(INPUT))
 				scalar += GetChar();
 			// we know it's a line break; see how many characters to read
 			int n = Exp::Break.Match(INPUT);
 			std::string line = GetChar(n);
 			info.AddBreak(line);
 		}
 		// one last whitespace join (with chompers this time)
 		scalar += info.Join(true);
 		// finally set the scalar
 		pToken->value = scalar;
 		// simple keys always ok after block scalars (since we're gonna start a new line anyways)
 		m_simpleKeyAllowed = true;
 		return pToken;
 	}
@ -340,4 +320,104 @@ namespace YAML
 				indent = 1;
 		}
 	}
 	// ScanScalar
 	// . Shared scanning loop for plain, quoted and block scalars: reads from
 	//   INPUT and returns the text of the scalar.
 	// . 'end' terminates the scalar when it matches at the current position;
 	//   the matched characters are consumed only if 'eatEnd' is set.
 	// . 'indent' is the column a continuation line must reach: after each line
 	//   break, spaces are eaten while m_column < indent, and a non-empty line
 	//   that starts below 'indent' ends the scalar.
 	// . 'escape' is the character that introduces an escape sequence (handed
 	//   to Exp::Escape). Escaped line breaks are only eaten when it is '\\'.
 	// . 'fold' turns a line break between two non-empty, not-more-indented
 	//   lines into a single space; every other line break becomes "\n".
 	// . 'eatLeadingWhitespace' additionally eats all blanks that follow the
 	//   required indentation of a continuation line.
 	// . 'trimTrailingSpaces' removes trailing ' ' characters from the result.
 	// . 'chomp' controls trailing newlines: -1 removes all of them, 0 keeps at
 	//   most one, and a positive value leaves them untouched.
 	// . Throws EOFInQuote if we hit EOF while 'eatEnd' is set.
 	std::string Scanner::ScanScalar(RegEx end, bool eatEnd, int indent, char escape, bool fold, bool eatLeadingWhitespace, bool trimTrailingSpaces, int chomp)
 	{
 		bool emptyLine = false, moreIndented = false;
 		std::string scalar;
 		while(INPUT) {
 			// ********************************
 			// Phase #1: scan until line ending
 			while(!end.Matches(INPUT) && !Exp::Break.Matches(INPUT)) {
 				if(INPUT.peek() == EOF)
 					break;
 				// escaped newline? (only if we're escaping on slash)
 				if(escape == '\\' && Exp::EscBreak.Matches(INPUT)) {
 					int n = Exp::EscBreak.Match(INPUT);
 					Eat(n);
 					continue;
 				}
 				// escape this?
 				if(INPUT.peek() == escape) {
 					int length = 0;
 					scalar += Exp::Escape(INPUT, length);
 					// Exp::Escape reads from INPUT directly, so account for the columns here
 					m_column += length;
 					continue;
 				}
 				// otherwise, just add the damn character
 				scalar += GetChar();
 			}
 			// eof? if we're looking to eat something, then we throw
 			if(INPUT.peek() == EOF) {
 				if(eatEnd)
 					throw EOFInQuote();
 				break;
 			}
 			// are we done via character match?
 			int n = end.Match(INPUT);
 			if(n >= 0) {
 				if(eatEnd)
 					Eat(n);
 				break;
 			}
 			// ********************************
 			// Phase #2: eat line ending
 			n = Exp::Break.Match(INPUT);
 			Eat(n);
 			// ********************************
 			// Phase #3: scan initial spaces
 			// first the required indentation
 			while(INPUT.peek() == ' ' && m_column < indent)
 				Eat(1);
 			// and then the rest of the whitespace
 			if(eatLeadingWhitespace) {
 				while(Exp::Blank.Matches(INPUT))
 					Eat(1);
 			}
 			// was this an empty line?
 			bool nextEmptyLine = Exp::Break.Matches(INPUT);
 			bool nextMoreIndented = (INPUT.peek() == ' ');
 			// fold the break into a space only between two "ordinary" lines
 			if(fold && !emptyLine && !nextEmptyLine && !moreIndented && !nextMoreIndented)
 				scalar += " ";
 			else
 				scalar += "\n";
 			emptyLine = nextEmptyLine;
 			moreIndented = nextMoreIndented;
 			// are we done via indentation?
 			if(!emptyLine && m_column < indent)
 				break;
 		}
 		// post-processing
 		if(trimTrailingSpaces) {
 			// npos means the scalar holds nothing but spaces (or is empty),
 			// in which case everything is "trailing" and has to go
 			const std::string::size_type lastNonSpace = scalar.find_last_not_of(' ');
 			if(lastNonSpace == std::string::npos)
 				scalar.clear();
 			else
 				scalar.erase(lastNonSpace + 1);
 		}
 		if(chomp <= 0) {
 			// 'content' is the length of the scalar without its trailing newlines
 			// (zero if the scalar consists of newlines only)
 			const std::string::size_type lastNonBreak = scalar.find_last_not_of('\n');
 			const std::string::size_type content = (lastNonBreak == std::string::npos ? 0 : lastNonBreak + 1);
 			// NOTE(review): a newline-only scalar keeps one newline when chomp == 0,
 			// as before - confirm that this is the intended clip behavior.
 			if(chomp == 0 && content < scalar.size())
 				scalar.erase(content + 1);
 			else if(chomp == -1)
 				scalar.erase(content);
 		}
 		return scalar;
 	}
 }
--- a/test.yaml
+++ b/test.yaml
@ -1,13 +1,4 @@
-people:
+---
-  - &jsb
+- "quoted scalar that contains
-    name: Jesse
+---
-    age: 23
+    the document start!"
  - &dab
    name: 'Daniel'
    age: 25
  - &ncb
    name: "Naftali"
    age: 21
 students:
  - *jsb
  - *ncb