diff --git a/exceptions.h b/exceptions.h index f8968b7272..3c66d85ac8 100644 --- a/exceptions.h +++ b/exceptions.h @@ -11,10 +11,10 @@ namespace YAML class IllegalMapKey: public Exception {}; class IllegalMapValue: public Exception {}; class IllegalScalar: public Exception {}; - class IllegalTabInScalar: public Exception {}; + class IllegalTabInIndentation: public Exception {}; class IllegalFlowEnd: public Exception {}; - class DocIndicatorInQuote: public Exception {}; - class EOFInQuote: public Exception {}; + class IllegalDocIndicator: public Exception {}; + class IllegalEOF: public Exception {}; class RequiredSimpleKeyNotFound: public Exception {}; class ZeroIndentationInBlockScalar: public Exception {}; class UnexpectedCharacterInBlockScalar: public Exception {}; diff --git a/exp.h b/exp.h index 85d9520f57..ea1804b49f 100644 --- a/exp.h +++ b/exp.h @@ -25,6 +25,7 @@ namespace YAML const RegEx DocStart = RegEx("---") + (BlankOrBreak || RegEx(EOF) || RegEx()); const RegEx DocEnd = RegEx("...") + (BlankOrBreak || RegEx(EOF) || RegEx()); + const RegEx DocIndicator = DocStart || DocEnd; const RegEx BlockEntry = RegEx('-') + (BlankOrBreak || RegEx(EOF)); const RegEx Key = RegEx('?'), KeyInFlow = RegEx('?') + BlankOrBreak; diff --git a/scanner.h b/scanner.h index ec6173be63..fd5ad1a87c 100644 --- a/scanner.h +++ b/scanner.h @@ -19,19 +19,20 @@ namespace YAML ~Scanner(); Token *GetNextToken(); + void Scan(); + private: + // scanning void ScanNextToken(); void ScanToNextToken(); Token *PushIndentTo(int column, bool sequence); void PopIndentTo(int column); + // checking input void InsertSimpleKey(); bool VerifySimpleKey(); void VerifyAllSimpleKeys(); - void Scan(); - - private: bool IsWhitespaceToBeEaten(char ch); bool IsDocumentStart(); bool IsDocumentEnd(); diff --git a/scanscalar.cpp b/scanscalar.cpp index 480480a36e..b3577f51c8 100644 --- a/scanscalar.cpp +++ b/scanscalar.cpp @@ -6,345 +6,13 @@ namespace YAML { - ////////////////////////////////////////////////////////// - // WhitespaceInfo - - WhitespaceInfo::WhitespaceInfo(): leadingBlanks(false), fold(true), chomp(0), increment(0) - { - } - - void WhitespaceInfo::SetChompers(char ch) - { - if(ch == '+') - chomp = 1; - else if(ch == '-') - chomp = -1; - else if(Exp::Digit.Matches(ch)) { - increment = ch - '0'; - if(increment == 0) - throw ZeroIndentationInBlockScalar(); - } - } - - void WhitespaceInfo::AddBlank(char ch) - { - if(!leadingBlanks) - whitespace += ch; - } - - void WhitespaceInfo::AddBreak(const std::string& line) - { - // where to store this character? - if(!leadingBlanks) { - leadingBlanks = true; - whitespace = ""; - leadingBreaks += line; - } else - trailingBreaks += line; - } - - std::string WhitespaceInfo::Join(bool lastLine) - { - std::string ret; - - if(leadingBlanks) { - // fold line break? - if(fold && Exp::Break.Matches(leadingBreaks) && trailingBreaks.empty() && !lastLine) - ret = " "; - else if(!lastLine || chomp != -1) - ret = leadingBreaks; - - if(!lastLine || chomp == 1) - ret += trailingBreaks; - - leadingBlanks = false; - leadingBreaks = ""; - trailingBreaks = ""; - } else if(!whitespace.empty()) { - ret = whitespace; - whitespace = ""; - } - - return ret; - } - - // PlainScalarToken - // . We scan these in passes of two steps each: First, grab all non-whitespace - // characters we can, and then grab all whitespace characters we can. - // . This has the benefit of letting us handle leading whitespace (which is chomped) - // and in-line whitespace (which is kept) separately. - template <> PlainScalarToken *Scanner::ScanToken(PlainScalarToken *pToken) - { - //// now eat and store the scalar - //std::string scalar; - //WhitespaceInfo info; - - //while(INPUT) { - // // doc start/end tokens - // if(IsDocumentStart() || IsDocumentEnd()) - // break; - - // // comment - // if(Exp::Comment.Matches(INPUT)) - // break; - - // // first eat non-blanks - // while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) { - // // illegal colon in flow context - // if(m_flowLevel > 0 && Exp::IllegalColonInScalar.Matches(INPUT)) - // throw IllegalScalar(); - - // // characters that might end the scalar - // if(m_flowLevel > 0 && Exp::EndScalarInFlow.Matches(INPUT)) - // break; - // if(m_flowLevel == 0 && Exp::EndScalar.Matches(INPUT)) - // break; - - // // finally, read the character! - // scalar += GetChar(); - // } - - // // did we hit a non-blank character that ended us? - // if(!Exp::BlankOrBreak.Matches(INPUT)) - // break; - - // // now eat blanks - // while(INPUT && Exp::BlankOrBreak.Matches(INPUT)) { - // if(Exp::Blank.Matches(INPUT)) { - // // can't use tabs as indentation! only spaces! - // if(INPUT.peek() == '\t' && info.leadingBlanks && m_column <= m_indents.top()) - // throw IllegalTabInScalar(); - - // info.AddBlank(GetChar()); - // } else { - // // we know it's a line break; see how many characters to read - // int n = Exp::Break.Match(INPUT); - // std::string line = GetChar(n); - // info.AddBreak(line); - - // // and we can't continue a simple key to the next line - // ValidateSimpleKey(); - // } - // } - - // // break if we're below the indentation level - // if(m_flowLevel == 0 && m_column <= m_indents.top()) - // break; - - // // finally join whitespace - // scalar += info.Join(); - //} - - ScanScalarInfo info; - info.end = (m_flowLevel > 0 ? Exp::EndScalarInFlow : Exp::EndScalar) || (RegEx(' ') + Exp::Comment); - info.eatEnd = false; - info.indent = (m_flowLevel > 0 ? 0 : m_indents.top() + 1); - info.fold = true; - info.eatLeadingWhitespace = true; - info.trimTrailingSpaces = true; - info.chomp = CLIP; - - // insert a potential simple key - if(m_simpleKeyAllowed) - InsertSimpleKey(); - - pToken->value = ScanScalar(INPUT, info); - - m_simpleKeyAllowed = false; - if(true/*info.leadingBlanks*/) - m_simpleKeyAllowed = true; - - return pToken; - } - - // QuotedScalarToken - template <> QuotedScalarToken *Scanner::ScanToken(QuotedScalarToken *pToken) - { - //// now eat and store the scalar - //std::string scalar; - //WhitespaceInfo info; - - //while(INPUT) { - // if(IsDocumentStart() || IsDocumentEnd()) - // throw DocIndicatorInQuote(); - - // if(INPUT.peek() == EOF) - // throw EOFInQuote(); - - // // first eat non-blanks - // while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) { - // // escaped single quote? - // if(pToken->single && Exp::EscSingleQuote.Matches(INPUT)) { - // int n = Exp::EscSingleQuote.Match(INPUT); - // scalar += GetChar(n); - // continue; - // } - - // // is the quote ending? - // if(INPUT.peek() == quote) - // break; - - // // escaped newline? - // if(Exp::EscBreak.Matches(INPUT)) - // break; - - // // other escape sequence - // if(INPUT.peek() == '\\') { - // int length = 0; - // scalar += Exp::Escape(INPUT, length); - // m_column += length; - // continue; - // } - - // // and finally, just add the damn character - // scalar += GetChar(); - // } - - // // is the quote ending? - // if(INPUT.peek() == quote) { - // // eat and go - // GetChar(); - // break; - // } - - // // now we eat blanks - // while(Exp::BlankOrBreak.Matches(INPUT)) { - // if(Exp::Blank.Matches(INPUT)) { - // info.AddBlank(GetChar()); - // } else { - // // we know it's a line break; see how many characters to read - // int n = Exp::Break.Match(INPUT); - // std::string line = GetChar(n); - // info.AddBreak(line); - - // // and we can't continue a simple key to the next line - // ValidateSimpleKey(); - // } - // } - - // // and finally join the whitespace - // scalar += info.Join(); - //} - - // eat single or double quote - char quote = INPUT.GetChar(); - pToken->single = (quote == '\''); - - ScanScalarInfo info; - info.end = (pToken->single ? RegEx(quote) && !Exp::EscSingleQuote : RegEx(quote)); - info.eatEnd = true; - info.escape = (pToken->single ? '\'' : '\\'); - info.indent = 0; - info.fold = true; - info.eatLeadingWhitespace = true; - info.trimTrailingSpaces = false; - info.chomp = CLIP; - - // insert a potential simple key - if(m_simpleKeyAllowed) - InsertSimpleKey(); - - pToken->value = ScanScalar(INPUT, info); - m_simpleKeyAllowed = false; - - return pToken; - } - - // BlockScalarToken - template <> BlockScalarToken *Scanner::ScanToken(BlockScalarToken *pToken) - { - WhitespaceInfo info; - - // eat block indicator ('|' or '>') - char indicator = INPUT.GetChar(); - info.fold = (indicator == Keys::FoldedScalar); - - // eat chomping/indentation indicators - int n = Exp::Chomp.Match(INPUT); - for(int i=0;i= 0) - indent += m_indents.top(); - - GetBlockIndentation(INPUT, indent, info.trailingBreaks, m_indents.top()); - - ScanScalarInfo sinfo; - sinfo.indent = indent; - sinfo.fold = info.fold; - sinfo.eatLeadingWhitespace = false; - sinfo.trimTrailingSpaces = false; - sinfo.chomp = (CHOMP) info.chomp; - - pToken->value = ScanScalar(INPUT, sinfo); - - // simple keys always ok after block scalars (since we're gonna start a new line anyways) - m_simpleKeyAllowed = true; - return pToken; - } - - // GetBlockIndentation - // . Helper to scanning a block scalar. - // . Eats leading *indentation* zeros (i.e., those that come before 'indent'), - // and updates 'indent' (if it hasn't been set yet). - void GetBlockIndentation(Stream& INPUT, int& indent, std::string& breaks, int topIndent) - { - int maxIndent = 0; - - while(1) { - // eat as many indentation spaces as we can - while((indent == 0 || INPUT.column < indent) && INPUT.peek() == ' ') - INPUT.Eat(1); - - if(INPUT.column > maxIndent) - maxIndent = INPUT.column; - - // do we need more indentation, but we've got a tab? - if((indent == 0 || INPUT.column < indent) && INPUT.peek() == '\t') - throw IllegalTabInScalar(); // TODO: are literal scalar lines allowed to have tabs here? - - // is this a non-empty line? - if(!Exp::Break.Matches(INPUT)) - break; - - // otherwise, eat the line break and move on - int n = Exp::Break.Match(INPUT); - breaks += INPUT.GetChar(n); - } - - // finally, set the indentation - if(indent == 0) { - indent = maxIndent; - if(indent < topIndent + 1) - indent = topIndent + 1; - if(indent < 1) - indent = 1; - } - } - // ScanScalar - std::string ScanScalar(Stream& INPUT, ScanScalarInfo info) + std::string ScanScalar(Stream& INPUT, ScanScalarInfo& info) { + bool foundNonEmptyLine = false; bool emptyLine = false, moreIndented = false; std::string scalar; + info.leadingSpaces = false; while(INPUT) { // ******************************** @@ -353,6 +21,16 @@ namespace YAML if(INPUT.peek() == EOF) break; + // document indicator? + if(INPUT.column == 0 && Exp::DocIndicator.Matches(INPUT)) { + if(info.onDocIndicator == BREAK) + break; + else if(info.onDocIndicator == THROW) + throw IllegalDocIndicator(); + } + + foundNonEmptyLine = true; + // escaped newline? (only if we're escaping on slash) if(info.escape == '\\' && Exp::EscBreak.Matches(INPUT)) { int n = Exp::EscBreak.Match(INPUT); @@ -373,10 +51,14 @@ namespace YAML // eof? if we're looking to eat something, then we throw if(INPUT.peek() == EOF) { if(info.eatEnd) - throw EOFInQuote(); + throw IllegalEOF(); break; } + // doc indicator? + if(info.onDocIndicator == BREAK && INPUT.column == 0 && Exp::DocIndicator.Matches(INPUT)) + break; + // are we done via character match? int n = info.end.Match(INPUT); if(n >= 0) { @@ -394,30 +76,44 @@ namespace YAML // Phase #3: scan initial spaces // first the required indentation - while(INPUT.peek() == ' ' && INPUT.column < info.indent) + while(INPUT.peek() == ' ' && (INPUT.column < info.indent || (info.detectIndent && !foundNonEmptyLine))) INPUT.Eat(1); + // update indent if we're auto-detecting + if(info.detectIndent && !foundNonEmptyLine) + info.indent = std::max(info.indent, INPUT.column); + // and then the rest of the whitespace - if(info.eatLeadingWhitespace) { - while(Exp::Blank.Matches(INPUT)) - INPUT.Eat(1); + while(Exp::Blank.Matches(INPUT)) { + // we check for tabs that masquerade as indentation + if(INPUT.peek() == '\t'&& INPUT.column < info.indent && info.onTabInIndentation == THROW) + throw IllegalTabInIndentation(); + + if(!info.eatLeadingWhitespace) + break; + + INPUT.Eat(1); } // was this an empty line? bool nextEmptyLine = Exp::Break.Matches(INPUT); bool nextMoreIndented = (INPUT.peek() == ' '); + // TODO: for block scalars, we always start with a newline, so we should fold OR keep that + if(info.fold && !emptyLine && !nextEmptyLine && !moreIndented && !nextMoreIndented) scalar += " "; else - scalar += "\n"; + scalar += "\n"; emptyLine = nextEmptyLine; moreIndented = nextMoreIndented; // are we done via indentation? - if(!emptyLine && INPUT.column < info.indent) + if(!emptyLine && INPUT.column < info.indent) { + info.leadingSpaces = true; break; + } } // post-processing @@ -437,4 +133,123 @@ namespace YAML return scalar; } + + // PlainScalarToken + template <> PlainScalarToken *Scanner::ScanToken(PlainScalarToken *pToken) + { + // set up the scanning parameters + ScanScalarInfo info; + info.end = (m_flowLevel > 0 ? Exp::EndScalarInFlow : Exp::EndScalar) || (RegEx(' ') + Exp::Comment); + info.eatEnd = false; + info.indent = (m_flowLevel > 0 ? 0 : m_indents.top() + 1); + info.fold = true; + info.eatLeadingWhitespace = true; + info.trimTrailingSpaces = true; + info.chomp = CLIP; + info.onDocIndicator = BREAK; + info.onTabInIndentation = THROW; + + // insert a potential simple key + if(m_simpleKeyAllowed) + InsertSimpleKey(); + + pToken->value = ScanScalar(INPUT, info); + + // can have a simple key only if we ended the scalar by starting a new line + m_simpleKeyAllowed = info.leadingSpaces; + + // finally, we can't have any colons in a scalar, so if we ended on a colon, there + // had better be a break after it + if(Exp::IllegalColonInScalar.Matches(INPUT)) + throw IllegalScalar(); + + return pToken; + } + + // QuotedScalarToken + template <> QuotedScalarToken *Scanner::ScanToken(QuotedScalarToken *pToken) + { + // eat single or double quote + char quote = INPUT.GetChar(); + pToken->single = (quote == '\''); + + // setup the scanning parameters + ScanScalarInfo info; + info.end = (pToken->single ? RegEx(quote) && !Exp::EscSingleQuote : RegEx(quote)); + info.eatEnd = true; + info.escape = (pToken->single ? '\'' : '\\'); + info.indent = 0; + info.fold = true; + info.eatLeadingWhitespace = true; + info.trimTrailingSpaces = false; + info.chomp = CLIP; + info.onDocIndicator = THROW; + + // insert a potential simple key + if(m_simpleKeyAllowed) + InsertSimpleKey(); + + pToken->value = ScanScalar(INPUT, info); + m_simpleKeyAllowed = false; + + return pToken; + } + + // BlockScalarToken + // . These need a little extra processing beforehand. + // . We need to scan the line where the indicator is (this doesn't count as part of the scalar), + // and then we need to figure out what level of indentation we'll be using. + template <> BlockScalarToken *Scanner::ScanToken(BlockScalarToken *pToken) + { + ScanScalarInfo info; + info.indent = 1; + info.detectIndent = true; + + // eat block indicator ('|' or '>') + char indicator = INPUT.GetChar(); + info.fold = (indicator == Keys::FoldedScalar); + + // eat chomping/indentation indicators + int n = Exp::Chomp.Match(INPUT); + for(int i=0;i= 0) + info.indent += m_indents.top(); + + info.eatLeadingWhitespace = false; + info.trimTrailingSpaces = false; + info.onTabInIndentation = THROW; + + pToken->value = ScanScalar(INPUT, info); + + // simple keys always ok after block scalars (since we're gonna start a new line anyways) + m_simpleKeyAllowed = true; + return pToken; + } } diff --git a/scanscalar.h b/scanscalar.h index f4987a1f73..c3d0030f29 100644 --- a/scanscalar.h +++ b/scanscalar.h @@ -7,35 +7,29 @@ namespace YAML { enum CHOMP { STRIP = -1, CLIP, KEEP }; + enum ACTION { NONE, BREAK, THROW }; struct ScanScalarInfo { - ScanScalarInfo(): eatEnd(false), indent(0), eatLeadingWhitespace(0), escape(0), fold(false), trimTrailingSpaces(0), chomp(CLIP) {} + ScanScalarInfo(): eatEnd(false), indent(0), detectIndent(false), eatLeadingWhitespace(0), escape(0), fold(false), + trimTrailingSpaces(0), chomp(CLIP), onDocIndicator(NONE), onTabInIndentation(NONE), leadingSpaces(false) {} + // input: RegEx end; // what condition ends this scalar? bool eatEnd; // should we eat that condition when we see it? int indent; // what level of indentation should be eaten and ignored? + bool detectIndent; // should we try to autodetect the indent? bool eatLeadingWhitespace; // should we continue eating this delicious indentation after 'indent' spaces? char escape; // what character do we escape on (i.e., slash or single quote) (0 for none) bool fold; // do we fold line ends? bool trimTrailingSpaces; // do we remove all trailing spaces (at the very end) CHOMP chomp; // do we strip, clip, or keep trailing newlines (at the very end) // Note: strip means kill all, clip means keep at most one, keep means keep all + ACTION onDocIndicator; // what do we do if we see a document indicator? + ACTION onTabInIndentation; // what do we do if we see a tab where we should be seeing indentation spaces + + // output: + bool leadingSpaces; }; - void GetBlockIndentation(Stream& INPUT, int& indent, std::string& breaks, int topIndent); - std::string ScanScalar(Stream& INPUT, ScanScalarInfo info); - - struct WhitespaceInfo { - WhitespaceInfo(); - - void SetChompers(char ch); - void AddBlank(char ch); - void AddBreak(const std::string& line); - std::string Join(bool lastline = false); - - bool leadingBlanks; - bool fold; - std::string whitespace, leadingBreaks, trailingBreaks; - int chomp, increment; - }; + std::string ScanScalar(Stream& INPUT, ScanScalarInfo& info); } diff --git a/test.yaml b/test.yaml index a7221f013d..ef3aeb2b37 100644 --- a/test.yaml +++ b/test.yaml @@ -1,14 +1,17 @@ --- -- "quoted scalar\twith a tab\nand a newline" -- 'This is Jesse''s single quote!' -- | - here's a literal: - #include - - int main() - { - std::cout << "Hello World!\n"; - return 0; - } -- key1: value1 - key2: value2 \ No newline at end of file +- here's a key: value + here's the first block: | + after the block: value + and here's a block: |- + What's going on? + How are you doing? + Here's some code: + + #include + int main() + { + std::cout << "Hello World!\n"; + } + + I'm doing fine! + and last key: value \ No newline at end of file