Mostly finished refactoring the scalar scanning.

2025-07-05 06:21:26 +12:00 · 2008-06-30 01:31:23 +00:00 · 2008-06-30 01:31:23 +00:00 · 0683cbf859
commit 0683cbf859
parent 5f8252ee6f
6 changed files with 192 additions and 378 deletions
--- a/exceptions.h
+++ b/exceptions.h
@ -11,10 +11,10 @@ namespace YAML
 	class IllegalMapKey: public Exception {};
 	class IllegalMapValue: public Exception {};
 	class IllegalScalar: public Exception {};
-	class IllegalTabInScalar: public Exception {};
+	class IllegalTabInIndentation: public Exception {};
 	class IllegalFlowEnd: public Exception {};
-	class DocIndicatorInQuote: public Exception {};
+	class IllegalDocIndicator: public Exception {};
-	class EOFInQuote: public Exception {};
+	class IllegalEOF: public Exception {};
 	class RequiredSimpleKeyNotFound: public Exception {};
 	class ZeroIndentationInBlockScalar: public Exception {};
 	class UnexpectedCharacterInBlockScalar: public Exception {};
--- a/exp.h
+++ b/exp.h
@ -25,6 +25,7 @@ namespace YAML
 		const RegEx DocStart = RegEx("---") + (BlankOrBreak || RegEx(EOF) || RegEx());
 		const RegEx DocEnd = RegEx("...") + (BlankOrBreak || RegEx(EOF) || RegEx());
 		const RegEx DocIndicator = DocStart || DocEnd;
 		const RegEx BlockEntry = RegEx('-') + (BlankOrBreak || RegEx(EOF));
 		const RegEx Key = RegEx('?'),
 		            KeyInFlow = RegEx('?') + BlankOrBreak;
--- a/scanner.h
+++ b/scanner.h
@ -19,19 +19,20 @@ namespace YAML
 		~Scanner();
 		Token *GetNextToken();
 		void Scan();
 	private:
 		// scanning
 		void ScanNextToken();
 		void ScanToNextToken();
 		Token *PushIndentTo(int column, bool sequence);
 		void PopIndentTo(int column);
 		// checking input
 		void InsertSimpleKey();
 		bool VerifySimpleKey();
 		void VerifyAllSimpleKeys();
 		void Scan();
 	private:
 		bool IsWhitespaceToBeEaten(char ch);
 		bool IsDocumentStart();
 		bool IsDocumentEnd();
--- a/scanscalar.cpp
+++ b/scanscalar.cpp
@ -6,345 +6,13 @@
 namespace YAML
 {
 	//////////////////////////////////////////////////////////
 	// WhitespaceInfo
 	// . Initial state: no line break seen yet, folding on, chomp 0 (neither '+' nor '-'),
 	//   and no explicit indentation increment.
 	WhitespaceInfo::WhitespaceInfo(): leadingBlanks(false), fold(true), chomp(0), increment(0)
 	{
 	}
 	// SetChompers
 	// . Applies a single indicator character: '+' sets chomp to 1, '-' sets chomp to -1,
 	//   and a character matching Exp::Digit sets 'increment' to its numeric value.
 	// . Throws ZeroIndentationInBlockScalar if that numeric value is 0.
 	// . Any other character leaves the state untouched.
 	void WhitespaceInfo::SetChompers(char ch)
 	{
 		if(ch == '+')
 			chomp = 1;
 		else if(ch == '-')
 			chomp = -1;
 		else if(Exp::Digit.Matches(ch)) {
 			increment = ch - '0';
 			if(increment == 0)
 				throw ZeroIndentationInBlockScalar();
 		}
 	}
 	// AddBlank
 	// . Appends 'ch' to the pending in-line whitespace.
 	// . Once a line break has been recorded (leadingBlanks is set), blanks are dropped instead.
 	void WhitespaceInfo::AddBlank(char ch)
 	{
 		if(!leadingBlanks)
 			whitespace += ch;
 	}
 	// AddBreak
 	// . The first break recorded since the last Join() discards the pending in-line
 	//   whitespace and goes into 'leadingBreaks'.
 	// . Every later break (until the next Join()) is appended to 'trailingBreaks'.
 	void WhitespaceInfo::AddBreak(const std::string& line)
 	{
 		// where to store this character?
 		if(!leadingBlanks) {
 			leadingBlanks = true;
 			whitespace = "";
 			leadingBreaks += line;
 		} else
 			trailingBreaks += line;
 	}
 	// Join
 	// . Returns the text that the accumulated whitespace contributes to the scalar,
 	//   and clears whatever state it consumed.
 	// . If at least one break was recorded, the result is built from the stored breaks
 	//   (see the comments below); otherwise it is the pending in-line whitespace, if any.
 	// . 'lastLine' makes the chomp setting take effect: -1 drops the leading breaks,
 	//   and only 1 keeps the trailing breaks.
 	std::string WhitespaceInfo::Join(bool lastLine)
 	{
 		std::string ret;
 		if(leadingBlanks) {
 			// fold line break?
 			// (only when folding is on, nothing but the first break was stored, and this isn't the last line)
 			if(fold && Exp::Break.Matches(leadingBreaks) && trailingBreaks.empty() && !lastLine)
 				ret = " ";
 			else if(!lastLine || chomp != -1)
 				ret = leadingBreaks;
 			// trailing breaks are always kept mid-scalar; on the last line only when chomp is 1
 			if(!lastLine || chomp == 1)
 				ret += trailingBreaks;
 			// reset the break state so the next AddBreak() starts over
 			leadingBlanks = false;
 			leadingBreaks = "";
 			trailingBreaks = "";
 		} else if(!whitespace.empty()) {
 			// no break was seen: hand back the in-line whitespace untouched
 			ret = whitespace;
 			whitespace = "";
 		}
 		return ret;
 	}
 	// PlainScalarToken
 	// . We scan these in passes of two steps each: First, grab all non-whitespace
 	//   characters we can, and then grab all whitespace characters we can.
 	// . This has the benefit of letting us handle leading whitespace (which is chomped)
 	//   and in-line whitespace (which is kept) separately.
 	// . NOTE: the two-step description above matches the commented-out loop kept below for
 	//   reference; the live code fills in a ScanScalarInfo and hands the work to ScanScalar.
 	// . Stores the scanned text in pToken->value and returns pToken.
 	template <> PlainScalarToken *Scanner::ScanToken(PlainScalarToken *pToken)
 	{
 		//// now eat and store the scalar
 		//std::string scalar;
 		//WhitespaceInfo info;
 		//while(INPUT) {
 		//	// doc start/end tokens
 		//	if(IsDocumentStart() || IsDocumentEnd())
 		//		break;
 		//	// comment
 		//	if(Exp::Comment.Matches(INPUT))
 		//		break;
 		//	// first eat non-blanks
 		//	while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
 		//		// illegal colon in flow context
 		//		if(m_flowLevel > 0 && Exp::IllegalColonInScalar.Matches(INPUT))
 		//			throw IllegalScalar();
 		//		// characters that might end the scalar
 		//		if(m_flowLevel > 0 && Exp::EndScalarInFlow.Matches(INPUT))
 		//			break;
 		//		if(m_flowLevel == 0 && Exp::EndScalar.Matches(INPUT))
 		//			break;
 		//		// finally, read the character!
 		//		scalar += GetChar();
 		//	}
 		//	// did we hit a non-blank character that ended us?
 		//	if(!Exp::BlankOrBreak.Matches(INPUT))
 		//		break;
 		//	// now eat blanks
 		//	while(INPUT && Exp::BlankOrBreak.Matches(INPUT)) {
 		//		if(Exp::Blank.Matches(INPUT)) {
 		//			// can't use tabs as indentation! only spaces!
 		//			if(INPUT.peek() == '\t' && info.leadingBlanks && m_column <= m_indents.top())
 		//				throw IllegalTabInScalar();
 		//			info.AddBlank(GetChar());
 		//		} else	{
 		//			// we know it's a line break; see how many characters to read
 		//			int n = Exp::Break.Match(INPUT);
 		//			std::string line = GetChar(n);
 		//			info.AddBreak(line);
 		//			// and we can't continue a simple key to the next line
 		//			ValidateSimpleKey();
 		//		}
 		//	}
 		//	// break if we're below the indentation level
 		//	if(m_flowLevel == 0 && m_column <= m_indents.top())
 		//		break;
 		//	// finally join whitespace
 		//	scalar += info.Join();
 		//}
 		ScanScalarInfo info;
 		// the scalar ends on the flow/block end expression, or on a space followed by a comment
 		info.end = (m_flowLevel > 0 ? Exp::EndScalarInFlow : Exp::EndScalar) || (RegEx(' ') + Exp::Comment);
 		info.eatEnd = false;
 		// in block context, continuation lines must be indented past the current indent level
 		info.indent = (m_flowLevel > 0 ? 0 : m_indents.top() + 1);
 		info.fold = true;
 		info.eatLeadingWhitespace = true;
 		info.trimTrailingSpaces = true;
 		info.chomp = CLIP;
 		// insert a potential simple key
 		if(m_simpleKeyAllowed)
 			InsertSimpleKey();
 		pToken->value = ScanScalar(INPUT, info);
 		m_simpleKeyAllowed = false;
 		// NOTE(review): the condition is hard-coded to true, so simple keys always end up allowed here
 		if(true/*info.leadingBlanks*/)
 			m_simpleKeyAllowed = true;
 		return pToken;
 	}
 	// QuotedScalarToken
 	// . Eats the opening quote, records whether it was a single quote in pToken->single,
 	//   and lets ScanScalar read up to and including the closing quote.
 	// . The commented-out loop below is the earlier hand-written scan, kept for reference.
 	// . Stores the scanned text in pToken->value and returns pToken.
 	template <> QuotedScalarToken *Scanner::ScanToken(QuotedScalarToken *pToken)
 	{
 		//// now eat and store the scalar
 		//std::string scalar;
 		//WhitespaceInfo info;
 		//while(INPUT) {
 		//	if(IsDocumentStart() || IsDocumentEnd())
 		//		throw DocIndicatorInQuote();
 		//	if(INPUT.peek() == EOF)
 		//		throw EOFInQuote();
 		//	// first eat non-blanks
 		//	while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
 		//		// escaped single quote?
 		//		if(pToken->single && Exp::EscSingleQuote.Matches(INPUT)) {
 		//			int n = Exp::EscSingleQuote.Match(INPUT);
 		//			scalar += GetChar(n);
 		//			continue;
 		//		}
 		//		// is the quote ending?
 		//		if(INPUT.peek() == quote)
 		//			break;
 		//		// escaped newline?
 		//		if(Exp::EscBreak.Matches(INPUT))
 		//			break;
 		//		// other escape sequence
 		//		if(INPUT.peek() == '\\') {
 		//			int length = 0;
 		//			scalar += Exp::Escape(INPUT, length);
 		//			m_column += length;
 		//			continue;
 		//		}
 		//		// and finally, just add the damn character
 		//		scalar += GetChar();
 		//	}
 		//	// is the quote ending?
 		//	if(INPUT.peek() == quote) {
 		//		// eat and go
 		//		GetChar();
 		//		break;
 		//	}
 		//	// now we eat blanks
 		//	while(Exp::BlankOrBreak.Matches(INPUT)) {
 		//		if(Exp::Blank.Matches(INPUT)) {
 		//			info.AddBlank(GetChar());
 		//		} else {
 		//			// we know it's a line break; see how many characters to read
 		//			int n = Exp::Break.Match(INPUT);
 		//			std::string line = GetChar(n);
 		//			info.AddBreak(line);
 		//			// and we can't continue a simple key to the next line
 		//			ValidateSimpleKey();
 		//		}
 		//	}
 		//	// and finally join the whitespace
 		//	scalar += info.Join();
 		//}
 		// eat single or double quote
 		char quote = INPUT.GetChar();
 		pToken->single = (quote == '\'');
 		ScanScalarInfo info;
 		// single-quoted: end on a quote that is not an escaped single quote; double-quoted: end on the quote
 		info.end = (pToken->single ? RegEx(quote) && !Exp::EscSingleQuote : RegEx(quote));
 		// the closing quote is consumed as part of the scan
 		info.eatEnd = true;
 		// escape character: single quote for single-quoted scalars, backslash for double-quoted ones
 		info.escape = (pToken->single ? '\'' : '\\');
 		info.indent = 0;
 		info.fold = true;
 		info.eatLeadingWhitespace = true;
 		info.trimTrailingSpaces = false;
 		info.chomp = CLIP;
 		// insert a potential simple key
 		if(m_simpleKeyAllowed)
 			InsertSimpleKey();
 		pToken->value = ScanScalar(INPUT, info);
 		m_simpleKeyAllowed = false;
 		return pToken;
 	}
 	// BlockScalarToken
 	// . Reads the header line (block indicator, chomping/indentation indicators, optional
 	//   trailing comment), works out the indentation, then lets ScanScalar read the body.
 	// . Throws UnexpectedCharacterInBlockScalar if anything other than blanks or a comment
 	//   follows the indicators on the header line.
 	// . Stores the scanned text in pToken->value and returns pToken.
 	template <> BlockScalarToken *Scanner::ScanToken(BlockScalarToken *pToken)
 	{
 		WhitespaceInfo info;
 		// eat block indicator ('|' or '>')
 		char indicator = INPUT.GetChar();
 		info.fold = (indicator == Keys::FoldedScalar);
 		// eat chomping/indentation indicators
 		int n = Exp::Chomp.Match(INPUT);
 		for(int i=0;i<n;i++)
 			info.SetChompers(INPUT.GetChar());
 		// first eat whitespace
 		while(Exp::Blank.Matches(INPUT))
 			INPUT.Eat(1);
 		// and comments to the end of the line
 		if(Exp::Comment.Matches(INPUT))
 			while(INPUT && !Exp::Break.Matches(INPUT))
 				INPUT.Eat(1);
 		// if it's not a line break, then we ran into a bad character inline
 		if(INPUT && !Exp::Break.Matches(INPUT))
 			throw UnexpectedCharacterInBlockScalar();
 		// and eat that baby
 		INPUT.EatLineBreak();
 		// set the initial indentation
 		// (an explicit increment is taken relative to the current indent level when that level is non-negative;
 		//  with no increment, indent stays 0 and GetBlockIndentation picks it)
 		int indent = info.increment;
 		if(info.increment && m_indents.top() >= 0)
 			indent += m_indents.top();
 		// NOTE(review): the breaks collected into info.trailingBreaks are not read again in this function
 		GetBlockIndentation(INPUT, indent, info.trailingBreaks, m_indents.top());
 		ScanScalarInfo sinfo;
 		sinfo.indent = indent;
 		sinfo.fold = info.fold;
 		sinfo.eatLeadingWhitespace = false;
 		sinfo.trimTrailingSpaces = false;
 		// WhitespaceInfo stores chomp as -1/0/1, which lines up with STRIP/CLIP/KEEP
 		sinfo.chomp = (CHOMP) info.chomp;
 		pToken->value = ScanScalar(INPUT, sinfo);
 		// simple keys always ok after block scalars (since we're gonna start a new line anyways)
 		m_simpleKeyAllowed = true;
 		return pToken;
 	}
 	// GetBlockIndentation
 	// . Helper to scanning a block scalar.
 	// . Eats leading *indentation* spaces (i.e., those that come before 'indent'),
 	//   and updates 'indent' (if it hasn't been set yet, i.e. it is 0 on entry).
 	// . Line breaks of the empty lines it skips are appended to 'breaks'.
 	// . When 'indent' is 0 on entry, it becomes the largest column reached on any visited line,
 	//   but never less than topIndent + 1 and never less than 1.
 	// . Throws if a tab shows up where more indentation spaces are still expected.
 	void GetBlockIndentation(Stream& INPUT, int& indent, std::string& breaks, int topIndent)
 	{
 		int maxIndent = 0;
 		while(1) {
 			// eat as many indentation spaces as we can
 			while((indent == 0 || INPUT.column < indent) && INPUT.peek() == ' ')
 				INPUT.Eat(1);
 			// remember the deepest column seen so far (empty lines count too)
 			if(INPUT.column > maxIndent)
 				maxIndent = INPUT.column;
 			// do we need more indentation, but we've got a tab?
 			// NOTE(review): the exceptions.h hunk of this same patch replaces IllegalTabInScalar with
 			//   IllegalTabInIndentation - confirm this name still exists where this function is built
 			if((indent == 0 || INPUT.column < indent) && INPUT.peek() == '\t')
 				throw IllegalTabInScalar();   // TODO: are literal scalar lines allowed to have tabs here?
 			// is this a non-empty line?
 			if(!Exp::Break.Matches(INPUT))
 				break;
 			// otherwise, eat the line break and move on
 			int n = Exp::Break.Match(INPUT);
 			breaks += INPUT.GetChar(n);
 		}
 		// finally, set the indentation
 		if(indent == 0) {
 			indent = maxIndent;
 			if(indent < topIndent + 1)
 				indent = topIndent + 1;
 			if(indent < 1)
 				indent = 1;
 		}
 	}
 	// ScanScalar
-	std::string ScanScalar(Stream& INPUT, ScanScalarInfo info)
+	std::string ScanScalar(Stream& INPUT, ScanScalarInfo& info)
 	{
 		bool foundNonEmptyLine = false;
 		bool emptyLine = false, moreIndented = false;
 		std::string scalar;
 		info.leadingSpaces = false;
 		while(INPUT) {
 			// ********************************
@ -353,6 +21,16 @@ namespace YAML
 				if(INPUT.peek() == EOF)
 					break;
 				// document indicator?
 				if(INPUT.column == 0 && Exp::DocIndicator.Matches(INPUT)) {
 					if(info.onDocIndicator == BREAK)
 						break;
 					else if(info.onDocIndicator == THROW)
 						throw IllegalDocIndicator();
 				}
 				foundNonEmptyLine = true;
 				// escaped newline? (only if we're escaping on slash)
 				if(info.escape == '\\' && Exp::EscBreak.Matches(INPUT)) {
 					int n = Exp::EscBreak.Match(INPUT);
@ -373,10 +51,14 @@ namespace YAML
 			// eof? if we're looking to eat something, then we throw
 			if(INPUT.peek() == EOF) {
 				if(info.eatEnd)
-					throw EOFInQuote();
+					throw IllegalEOF();
 				break;
 			}
 			// doc indicator?
 			if(info.onDocIndicator == BREAK && INPUT.column == 0 && Exp::DocIndicator.Matches(INPUT))
 				break;
 			// are we done via character match?
 			int n = info.end.Match(INPUT);
 			if(n >= 0) {
@ -394,12 +76,22 @@ namespace YAML
 			// Phase #3: scan initial spaces
 			// first the required indentation
-			while(INPUT.peek() == ' ' && INPUT.column < info.indent)
+			while(INPUT.peek() == ' ' && (INPUT.column < info.indent || (info.detectIndent && !foundNonEmptyLine)))
 				INPUT.Eat(1);
 			// update indent if we're auto-detecting
 			if(info.detectIndent && !foundNonEmptyLine)
 				info.indent = std::max(info.indent, INPUT.column);
 			// and then the rest of the whitespace
-			if(info.eatLeadingWhitespace) {
+			while(Exp::Blank.Matches(INPUT)) {
-				while(Exp::Blank.Matches(INPUT))
+				// we check for tabs that masquerade as indentation
 				if(INPUT.peek() == '\t'&& INPUT.column < info.indent && info.onTabInIndentation == THROW)
 					throw IllegalTabInIndentation();
 				if(!info.eatLeadingWhitespace)
 					break;
 				INPUT.Eat(1);
 			}
@ -407,6 +99,8 @@ namespace YAML
 			bool nextEmptyLine = Exp::Break.Matches(INPUT);
 			bool nextMoreIndented = (INPUT.peek() == ' ');
 			// TODO: for block scalars, we always start with a newline, so we should fold OR keep that
 			if(info.fold && !emptyLine && !nextEmptyLine && !moreIndented && !nextMoreIndented)
 				scalar += " ";
 			else
@ -416,9 +110,11 @@ namespace YAML
 			moreIndented = nextMoreIndented;
 			// are we done via indentation?
-			if(!emptyLine && INPUT.column < info.indent)
+			if(!emptyLine && INPUT.column < info.indent) {
 				info.leadingSpaces = true;
 				break;
 			}
 		}
 		// post-processing
 		if(info.trimTrailingSpaces) {
@ -437,4 +133,123 @@ namespace YAML
 		return scalar;
 	}
 	// PlainScalarToken
 	// . Configures a ScanScalarInfo for a plain (unquoted) scalar and lets ScanScalar do the scan.
 	// . A document indicator ends the scalar; a tab in the indentation is an error.
 	// . Throws IllegalScalar if the input right after the scalar matches Exp::IllegalColonInScalar.
 	// . Stores the scanned text in pToken->value and returns pToken.
 	template <> PlainScalarToken *Scanner::ScanToken(PlainScalarToken *pToken)
 	{
 		// set up the scanning parameters
 		ScanScalarInfo info;
 		// the scalar ends on the flow/block end expression, or on a space followed by a comment
 		info.end = (m_flowLevel > 0 ? Exp::EndScalarInFlow : Exp::EndScalar) || (RegEx(' ') + Exp::Comment);
 		info.eatEnd = false;
 		// in block context, continuation lines must be indented past the current indent level
 		info.indent = (m_flowLevel > 0 ? 0 : m_indents.top() + 1);
 		info.fold = true;
 		info.eatLeadingWhitespace = true;
 		info.trimTrailingSpaces = true;
 		info.chomp = CLIP;
 		info.onDocIndicator = BREAK;
 		info.onTabInIndentation = THROW;
 		// insert a potential simple key
 		if(m_simpleKeyAllowed)
 			InsertSimpleKey();
 		pToken->value = ScanScalar(INPUT, info);
 		// can have a simple key only if we ended the scalar by starting a new line
 		// (leadingSpaces is the output flag ScanScalar sets when it stops on an under-indented line)
 		m_simpleKeyAllowed = info.leadingSpaces;
 		// finally, we can't have any colons in a scalar, so if we ended on a colon, there
 		// had better be a break after it
 		if(Exp::IllegalColonInScalar.Matches(INPUT))
 			throw IllegalScalar();
 		return pToken;
 	}
 	// QuotedScalarToken
 	// . Eats the opening quote, records whether it was a single quote in pToken->single,
 	//   and lets ScanScalar read up to and including the closing quote.
 	// . A document indicator inside the quotes is an error (onDocIndicator is THROW).
 	// . Stores the scanned text in pToken->value and returns pToken.
 	template <> QuotedScalarToken *Scanner::ScanToken(QuotedScalarToken *pToken)
 	{
 		// eat single or double quote
 		char quote = INPUT.GetChar();
 		pToken->single = (quote == '\'');
 		// setup the scanning parameters
 		ScanScalarInfo info;
 		// single-quoted: end on a quote that is not an escaped single quote; double-quoted: end on the quote
 		info.end = (pToken->single ? RegEx(quote) && !Exp::EscSingleQuote : RegEx(quote));
 		// the closing quote is consumed as part of the scan
 		info.eatEnd = true;
 		// escape character: single quote for single-quoted scalars, backslash for double-quoted ones
 		info.escape = (pToken->single ? '\'' : '\\');
 		info.indent = 0;
 		info.fold = true;
 		info.eatLeadingWhitespace = true;
 		info.trimTrailingSpaces = false;
 		info.chomp = CLIP;
 		info.onDocIndicator = THROW;
 		// insert a potential simple key
 		if(m_simpleKeyAllowed)
 			InsertSimpleKey();
 		pToken->value = ScanScalar(INPUT, info);
 		// a simple key is never allowed right after a quoted scalar
 		m_simpleKeyAllowed = false;
 		return pToken;
 	}
 	// BlockScalarToken
 	// . These need a little extra processing beforehand.
 	// . We need to scan the line where the indicator is (this doesn't count as part of the scalar),
 	//   and then we need to figure out what level of indentation we'll be using.
 	// . Throws ZeroIndentationInBlockScalar for an explicit indentation indicator of 0, and
 	//   UnexpectedCharacterInBlockScalar if anything other than blanks or a comment follows the indicators.
 	// . Stores the scanned text in pToken->value and returns pToken.
 	template <> BlockScalarToken *Scanner::ScanToken(BlockScalarToken *pToken)
 	{
 		ScanScalarInfo info;
 		// default: at least one column of indentation, with the real amount auto-detected
 		info.indent = 1;
 		info.detectIndent = true;
 		// eat block indicator ('|' or '>')
 		char indicator = INPUT.GetChar();
 		info.fold = (indicator == Keys::FoldedScalar);
 		// eat chomping/indentation indicators
 		int n = Exp::Chomp.Match(INPUT);
 		for(int i=0;i<n;i++) {
 			char ch = INPUT.GetChar();
 			if(ch == '+')
 				info.chomp = KEEP;
 			else if(ch == '-')
 				info.chomp = STRIP;
 			else if(Exp::Digit.Matches(ch)) {
 				// an explicit indentation indicator replaces the default and turns auto-detection off
 				info.indent = ch - '0';
 				info.detectIndent = false;
 				if(info.indent == 0)
 					throw ZeroIndentationInBlockScalar();
 			}
 		}
 		// now eat whitespace
 		while(Exp::Blank.Matches(INPUT))
 			INPUT.Eat(1);
 		// and comments to the end of the line
 		if(Exp::Comment.Matches(INPUT))
 			while(INPUT && !Exp::Break.Matches(INPUT))
 				INPUT.Eat(1);
 		// if it's not a line break, then we ran into a bad character inline
 		if(INPUT && !Exp::Break.Matches(INPUT))
 			throw UnexpectedCharacterInBlockScalar();
 		// set the initial indentation
 		// (the indent so far is taken relative to the current indent level, when that level is non-negative)
 		if(m_indents.top() >= 0)
 			info.indent += m_indents.top();
 		info.eatLeadingWhitespace = false;
 		info.trimTrailingSpaces = false;
 		info.onTabInIndentation = THROW;
 		pToken->value = ScanScalar(INPUT, info);
 		// simple keys always ok after block scalars (since we're gonna start a new line anyways)
 		m_simpleKeyAllowed = true;
 		return pToken;
 	}
 }
--- a/scanscalar.h
+++ b/scanscalar.h
@ -7,35 +7,29 @@
 namespace YAML
 {
 	// what to do with trailing newlines at the very end of a scalar:
 	// STRIP kills them all, CLIP keeps at most one, KEEP keeps them all
 	enum CHOMP { STRIP = -1, CLIP, KEEP };
 	// how ScanScalar reacts to a condition it detects (see ScanScalarInfo::onDocIndicator and
 	// ScanScalarInfo::onTabInIndentation): NONE takes no special action, BREAK ends the scan, THROW raises an exception
 	enum ACTION { NONE, BREAK, THROW };
 	// ScanScalarInfo
 	// . Parameter block for ScanScalar: the caller fills in the input fields, and
 	//   ScanScalar reports back through the output field.
 	struct ScanScalarInfo {
-		ScanScalarInfo(): eatEnd(false), indent(0), eatLeadingWhitespace(0), escape(0), fold(false), trimTrailingSpaces(0), chomp(CLIP) {}
+		ScanScalarInfo(): eatEnd(false), indent(0), detectIndent(false), eatLeadingWhitespace(0), escape(0), fold(false),
 			trimTrailingSpaces(0), chomp(CLIP), onDocIndicator(NONE), onTabInIndentation(NONE), leadingSpaces(false) {}
 		// input:
 		RegEx end;                      // what condition ends this scalar?
 		bool eatEnd;                    // should we eat that condition when we see it?
 		int indent;                     // what level of indentation should be eaten and ignored?
 		bool detectIndent;              // should we try to autodetect the indent?
 		bool eatLeadingWhitespace;      // should we continue eating this delicious indentation after 'indent' spaces?
 		char escape;                    // what character do we escape on (i.e., slash or single quote) (0 for none)
 		bool fold;                      // do we fold line ends?
 		bool trimTrailingSpaces;        // do we remove all trailing spaces (at the very end)
 		CHOMP chomp;                    // do we strip, clip, or keep trailing newlines (at the very end)
 		                                //   Note: strip means kill all, clip means keep at most one, keep means keep all
 		ACTION onDocIndicator;          // what do we do if we see a document indicator?
 		ACTION onTabInIndentation;      // what do we do if we see a tab where we should be seeing indentation spaces
 		// output:
 		bool leadingSpaces;             // did the scan stop because a non-empty line was indented less than 'indent'?
 	};
-	void GetBlockIndentation(Stream& INPUT, int& indent, std::string& breaks, int topIndent);
+	std::string ScanScalar(Stream& INPUT, ScanScalarInfo& info);
 	std::string ScanScalar(Stream& INPUT, ScanScalarInfo info);
 	// WhitespaceInfo
 	// . Accumulates the blanks and line breaks seen between two runs of scalar text,
 	//   and turns them into the text to insert via Join().
 	struct WhitespaceInfo {
 		WhitespaceInfo();
 		void SetChompers(char ch);                  // apply one '+', '-' or digit indicator
 		void AddBlank(char ch);                     // record an in-line blank (dropped once a break was seen)
 		void AddBreak(const std::string& line);     // record a line break
 		std::string Join(bool lastline = false);    // produce the joined whitespace and reset
 		bool leadingBlanks;                                       // has a line break been recorded since the last Join()?
 		bool fold;                                                // may a lone line break be folded into a space?
 		std::string whitespace, leadingBreaks, trailingBreaks;    // pending blanks / first break / later breaks
 		int chomp, increment;                                     // chomp: -1, 0 or 1; increment: explicit indentation digit (0 if none)
 	};
 }
--- a/test.yaml
+++ b/test.yaml
@ -1,14 +1,17 @@
 ---
- "quoted scalar\twith a tab\nand a newline"
+- here's a key: value
- 'This is Jesse''s single quote!'
+  here's the first block: |
- |
+  after the block: value
-  here's a literal:
+  and here's a block: |-
-  #include <iostream>
+    What's going on?
    How are you doing?
    Here's some code:
      #include <iostream>
      int main()
      {
         std::cout << "Hello World!\n";
     return 0;
      }
- key1: value1
+    
-  key2: value2
+    I'm doing fine!
  and last key: value