Refactored common scalar scanning code (from plain, quoted, and block) to one function.

This commit is contained in:
Jesse Beder 2008-06-29 05:45:41 +00:00
parent 6c193d6fbd
commit 0d5a97bffe
6 changed files with 306 additions and 188 deletions

10
exp.cpp
View file

@ -62,7 +62,7 @@ namespace YAML
}
// Escape
// . Escapes the sequence starting 'in' (it must begin with a '\')
// . Escapes the sequence starting 'in' (it must begin with a '\' or single quote)
// and returns the result.
// . Fills 'length' with how many characters we ate.
// . Throws if it's an unknown escape character.
@ -72,10 +72,16 @@ namespace YAML
length = 2;
// eat slash
in.get();
char escape = in.get();
// switch on escape character
char ch = in.get();
// first do single quote, since it's easier
if(escape == '\'' && ch == '\'')
return "\'";
// now do the slash (we're not gonna check if it's a slash - you better pass one!)
switch(ch) {
case '0': return "\0";
case 'a': return "\x07";

View file

@ -53,6 +53,7 @@ namespace YAML
case REGEX_MATCH: m_pOp = new MatchOperator; break;
case REGEX_RANGE: m_pOp = new RangeOperator; break;
case REGEX_OR: m_pOp = new OrOperator; break;
case REGEX_AND: m_pOp = new AndOperator; break;
case REGEX_NOT: m_pOp = new NotOperator; break;
case REGEX_SEQ: m_pOp = new SeqOperator; break;
}
@ -80,19 +81,13 @@ namespace YAML
// . Returns the number of characters matched.
// . Returns -1 if no characters were matched (the reason for
// not returning zero is that we may have an empty regex
// which SHOULD be considered successfully matching nothing,
// but that of course matches zero characters).
// which is ALWAYS successful at matching zero characters).
int RegEx::Match(const std::string& str) const
{
if(!m_pOp)
return -1;
return 0;
return m_pOp->Match(str, *this);
//case REGEX_EMPTY:
// if(str.empty())
// return 0;
// return -1;
}
// Match
@ -131,6 +126,14 @@ namespace YAML
return ret;
}
// operator &&
// . Builds a REGEX_AND expression whose operands are 'ex1' and 'ex2', in that order.
// . The order matters: a successful AND match reports the length matched
//   by its FIRST operand.
RegEx operator && (const RegEx& ex1, const RegEx& ex2)
{
	RegEx conjunction(REGEX_AND);

	// left operand first, then the right one
	conjunction.m_params.push_back(ex1);
	conjunction.m_params.push_back(ex2);

	return conjunction;
}
RegEx operator + (const RegEx& ex1, const RegEx& ex2)
{
RegEx ret(REGEX_SEQ);
@ -194,6 +197,36 @@ namespace YAML
return -1;
}
// AndOperator
// . Succeeds only if EVERY operand of 'regex' matches 'str'.
// . The operands may match different lengths, so there is no single
//   natural "length" for the whole expression; by convention we report
//   the length matched by the FIRST operand.
// . Returns -1 if any operand fails to match (or if there are no operands).
int RegEx::AndOperator::Match(const std::string& str, const RegEx& regex) const
{
	const unsigned count = regex.m_params.size();
	if(count == 0)
		return -1;

	// the first operand decides the reported length
	const int firstLength = regex.m_params[0].Match(str);
	if(firstLength == -1)
		return -1;

	// the remaining operands only have to succeed
	for(unsigned idx = 1; idx < count; ++idx) {
		if(regex.m_params[idx].Match(str) == -1)
			return -1;
	}

	return firstLength;
}
// AndOperator (stream version)
// . Succeeds only if EVERY operand of 'regex' matches the stream 'in';
//   the operands are tried in order, each against the same stream.
// . Returns the length matched by the FIRST operand, or -1 if any operand
//   fails to match (or if there are no operands).
int RegEx::AndOperator::Match(std::istream& in, const RegEx& regex) const
{
	const unsigned count = regex.m_params.size();
	if(count == 0)
		return -1;

	// the first operand decides the reported length
	const int firstLength = regex.m_params[0].Match(in);
	if(firstLength == -1)
		return -1;

	// the remaining operands only have to succeed
	for(unsigned idx = 1; idx < count; ++idx) {
		if(regex.m_params[idx].Match(in) == -1)
			return -1;
	}

	return firstLength;
}
// NotOperator
int RegEx::NotOperator::Match(const std::string& str, const RegEx& regex) const
{

View file

@ -6,7 +6,7 @@
namespace YAML
{
enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_NOT, REGEX_SEQ };
enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_AND, REGEX_NOT, REGEX_SEQ };
// simplified regular expressions
// . Only straightforward matches (no repeated characters)
@ -35,6 +35,11 @@ namespace YAML
virtual int Match(std::istream& in, const RegEx& regex) const;
};
// AndOperator
// . Matching strategy for REGEX_AND expressions (built by operator &&).
// . Both overloads succeed only if every operand matches, and then return
//   the length matched by the first operand; otherwise they return -1.
struct AndOperator: public Operator {
// match against an in-memory string
virtual int Match(const std::string& str, const RegEx& regex) const;
// match against an input stream
virtual int Match(std::istream& in, const RegEx& regex) const;
};
struct NotOperator: public Operator {
virtual int Match(const std::string& str, const RegEx& regex) const;
virtual int Match(std::istream& in, const RegEx& regex) const;
@ -63,6 +68,7 @@ namespace YAML
friend RegEx operator ! (const RegEx& ex);
friend RegEx operator || (const RegEx& ex1, const RegEx& ex2);
friend RegEx operator && (const RegEx& ex1, const RegEx& ex2);
friend RegEx operator + (const RegEx& ex1, const RegEx& ex2);
private:

View file

@ -5,6 +5,7 @@
#include <queue>
#include <stack>
#include <set>
#include "regex.h"
namespace YAML
{
@ -44,6 +45,7 @@ namespace YAML
bool IsPlainScalar();
void GetBlockIndentation(int& indent, std::string& breaks);
std::string ScanScalar(RegEx end, bool eatEnd, int indent, char escape, bool fold, bool eatLeadingWhitespace, bool trimTrailingSpaces, int chomp);
struct SimpleKey {
SimpleKey(int pos_, int line_, int column_, int flowLevel_);

View file

@ -75,74 +75,77 @@ namespace YAML
// and in-line whitespace (which is kept) separately.
template <> PlainScalarToken *Scanner::ScanToken(PlainScalarToken *pToken)
{
//// now eat and store the scalar
//std::string scalar;
//WhitespaceInfo info;
//while(INPUT) {
// // doc start/end tokens
// if(IsDocumentStart() || IsDocumentEnd())
// break;
// // comment
// if(Exp::Comment.Matches(INPUT))
// break;
// // first eat non-blanks
// while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
// // illegal colon in flow context
// if(m_flowLevel > 0 && Exp::IllegalColonInScalar.Matches(INPUT))
// throw IllegalScalar();
// // characters that might end the scalar
// if(m_flowLevel > 0 && Exp::EndScalarInFlow.Matches(INPUT))
// break;
// if(m_flowLevel == 0 && Exp::EndScalar.Matches(INPUT))
// break;
// // finally, read the character!
// scalar += GetChar();
// }
// // did we hit a non-blank character that ended us?
// if(!Exp::BlankOrBreak.Matches(INPUT))
// break;
// // now eat blanks
// while(INPUT && Exp::BlankOrBreak.Matches(INPUT)) {
// if(Exp::Blank.Matches(INPUT)) {
// // can't use tabs as indentation! only spaces!
// if(INPUT.peek() == '\t' && info.leadingBlanks && m_column <= m_indents.top())
// throw IllegalTabInScalar();
// info.AddBlank(GetChar());
// } else {
// // we know it's a line break; see how many characters to read
// int n = Exp::Break.Match(INPUT);
// std::string line = GetChar(n);
// info.AddBreak(line);
// // and we can't continue a simple key to the next line
// ValidateSimpleKey();
// }
// }
// // break if we're below the indentation level
// if(m_flowLevel == 0 && m_column <= m_indents.top())
// break;
// // finally join whitespace
// scalar += info.Join();
//}
RegEx end = (m_flowLevel > 0 ? Exp::EndScalarInFlow : Exp::EndScalar) || (RegEx(' ') + Exp::Comment);
int indent = (m_flowLevel > 0 ? 0 : m_indents.top() + 1);
// insert a potential simple key
if(m_simpleKeyAllowed)
InsertSimpleKey();
pToken->value = ScanScalar(end, false, indent, 0, true, true, true, 0);
m_simpleKeyAllowed = false;
// now eat and store the scalar
std::string scalar;
WhitespaceInfo info;
while(INPUT) {
// doc start/end tokens
if(IsDocumentStart() || IsDocumentEnd())
break;
// comment
if(Exp::Comment.Matches(INPUT))
break;
// first eat non-blanks
while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
// illegal colon in flow context
if(m_flowLevel > 0 && Exp::IllegalColonInScalar.Matches(INPUT))
throw IllegalScalar();
// characters that might end the scalar
if(m_flowLevel > 0 && Exp::EndScalarInFlow.Matches(INPUT))
break;
if(m_flowLevel == 0 && Exp::EndScalar.Matches(INPUT))
break;
// finally, read the character!
scalar += GetChar();
}
// did we hit a non-blank character that ended us?
if(!Exp::BlankOrBreak.Matches(INPUT))
break;
// now eat blanks
while(INPUT && Exp::BlankOrBreak.Matches(INPUT)) {
if(Exp::Blank.Matches(INPUT)) {
// can't use tabs as indentation! only spaces!
if(INPUT.peek() == '\t' && info.leadingBlanks && m_column <= m_indents.top())
throw IllegalTabInScalar();
info.AddBlank(GetChar());
} else {
// we know it's a line break; see how many characters to read
int n = Exp::Break.Match(INPUT);
std::string line = GetChar(n);
info.AddBreak(line);
// and we can't continue a simple key to the next line
ValidateSimpleKey();
}
}
// break if we're below the indentation level
if(m_flowLevel == 0 && m_column <= m_indents.top())
break;
// finally join whitespace
scalar += info.Join();
}
// now modify our token
pToken->value = scalar;
if(info.leadingBlanks)
if(true/*info.leadingBlanks*/)
m_simpleKeyAllowed = true;
return pToken;
@ -151,91 +154,92 @@ namespace YAML
// QuotedScalarToken
template <> QuotedScalarToken *Scanner::ScanToken(QuotedScalarToken *pToken)
{
// insert a potential simple key
if(m_simpleKeyAllowed)
InsertSimpleKey();
m_simpleKeyAllowed = false;
//// now eat and store the scalar
//std::string scalar;
//WhitespaceInfo info;
//while(INPUT) {
// if(IsDocumentStart() || IsDocumentEnd())
// throw DocIndicatorInQuote();
// if(INPUT.peek() == EOF)
// throw EOFInQuote();
// // first eat non-blanks
// while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
// // escaped single quote?
// if(pToken->single && Exp::EscSingleQuote.Matches(INPUT)) {
// int n = Exp::EscSingleQuote.Match(INPUT);
// scalar += GetChar(n);
// continue;
// }
// // is the quote ending?
// if(INPUT.peek() == quote)
// break;
// // escaped newline?
// if(Exp::EscBreak.Matches(INPUT))
// break;
// // other escape sequence
// if(INPUT.peek() == '\\') {
// int length = 0;
// scalar += Exp::Escape(INPUT, length);
// m_column += length;
// continue;
// }
// // and finally, just add the damn character
// scalar += GetChar();
// }
// // is the quote ending?
// if(INPUT.peek() == quote) {
// // eat and go
// GetChar();
// break;
// }
// // now we eat blanks
// while(Exp::BlankOrBreak.Matches(INPUT)) {
// if(Exp::Blank.Matches(INPUT)) {
// info.AddBlank(GetChar());
// } else {
// // we know it's a line break; see how many characters to read
// int n = Exp::Break.Match(INPUT);
// std::string line = GetChar(n);
// info.AddBreak(line);
// // and we can't continue a simple key to the next line
// ValidateSimpleKey();
// }
// }
// // and finally join the whitespace
// scalar += info.Join();
//}
// eat single or double quote
char quote = GetChar();
pToken->single = (quote == '\'');
// now eat and store the scalar
std::string scalar;
WhitespaceInfo info;
RegEx end = (pToken->single ? RegEx(quote) && !Exp::EscSingleQuote : RegEx(quote));
char escape = (pToken->single ? '\'' : '\\');
while(INPUT) {
if(IsDocumentStart() || IsDocumentEnd())
throw DocIndicatorInQuote();
// insert a potential simple key
if(m_simpleKeyAllowed)
InsertSimpleKey();
if(INPUT.peek() == EOF)
throw EOFInQuote();
pToken->value = ScanScalar(end, true, 0, escape, true, true, false, 0);
m_simpleKeyAllowed = false;
// first eat non-blanks
while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) {
// escaped single quote?
if(pToken->single && Exp::EscSingleQuote.Matches(INPUT)) {
int n = Exp::EscSingleQuote.Match(INPUT);
scalar += GetChar(n);
continue;
}
// is the quote ending?
if(INPUT.peek() == quote)
break;
// escaped newline?
if(Exp::EscBreak.Matches(INPUT))
break;
// other escape sequence
if(INPUT.peek() == '\\') {
int length = 0;
scalar += Exp::Escape(INPUT, length);
m_column += length;
continue;
}
// and finally, just add the damn character
scalar += GetChar();
}
// is the quote ending?
if(INPUT.peek() == quote) {
// eat and go
GetChar();
break;
}
// now we eat blanks
while(Exp::BlankOrBreak.Matches(INPUT)) {
if(Exp::Blank.Matches(INPUT)) {
info.AddBlank(GetChar());
} else {
// we know it's a line break; see how many characters to read
int n = Exp::Break.Match(INPUT);
std::string line = GetChar(n);
info.AddBreak(line);
// and we can't continue a simple key to the next line
ValidateSimpleKey();
}
}
// and finally join the whitespace
scalar += info.Join();
}
pToken->value = scalar;
return pToken;
}
// BlockScalarToken
template <> BlockScalarToken *Scanner::ScanToken(BlockScalarToken *pToken)
{
// simple keys always ok after block scalars (since we're gonna start a new line anyways)
m_simpleKeyAllowed = true;
WhitespaceInfo info;
// eat block indicator ('|' or '>')
@ -268,37 +272,13 @@ namespace YAML
if(info.increment && m_indents.top() >= 0)
indent += m_indents.top();
// finally, grab that scalar
std::string scalar;
while(INPUT) {
// initialize indentation
GetBlockIndentation(indent, info.trailingBreaks);
// are we done with this guy (i.e. at a lower indentation?)
if(m_column != indent)
break;
bool trailingBlank = Exp::Blank.Matches(INPUT);
scalar += info.Join();
bool leadingBlank = Exp::Blank.Matches(INPUT);
// now eat and save the line
while(INPUT.peek() != EOF && !Exp::Break.Matches(INPUT))
scalar += GetChar();
// we know it's a line break; see how many characters to read
int n = Exp::Break.Match(INPUT);
std::string line = GetChar(n);
info.AddBreak(line);
}
// one last whitespace join (with chompers this time)
scalar += info.Join(true);
// finally set the scalar
pToken->value = scalar;
bool eatLeadingWhitespace = false;
pToken->value = ScanScalar(RegEx(), false, indent, 0, info.fold, eatLeadingWhitespace, false, info.chomp);
// simple keys always ok after block scalars (since we're gonna start a new line anyways)
m_simpleKeyAllowed = true;
return pToken;
}
@ -340,4 +320,104 @@ namespace YAML
indent = 1;
}
}
// ScanScalar
// . Common scalar-reading routine shared by the plain, quoted and block scanners.
// . Reads from INPUT until 'end' matches, EOF is reached, or a non-empty
//   line starts at a column less than 'indent'.
// . Parameters:
//   end                  - expression that terminates the scalar.
//   eatEnd               - if true, the text matched by 'end' is consumed, and
//                          reaching EOF before 'end' throws EOFInQuote.
//   indent               - required indentation of continuation lines.
//   escape               - character that introduces an escape sequence (handed
//                          to Exp::Escape); if it is '\\', an escaped line break
//                          (Exp::EscBreak) is swallowed too.
//   fold                 - if true, a line break between two non-empty lines that
//                          are not "more indented" becomes a single space;
//                          otherwise every line break is stored as '\n'.
//   eatLeadingWhitespace - if true, all blanks at the start of a continuation
//                          line are skipped (not just the required indentation).
//   trimTrailingSpaces   - if true, trailing spaces are removed from the result.
//   chomp                - 0: keep at most one trailing newline,
//                          -1: remove all trailing newlines,
//                          >0: leave trailing newlines alone.
// . Returns the scanned scalar.
std::string Scanner::ScanScalar(RegEx end, bool eatEnd, int indent, char escape, bool fold, bool eatLeadingWhitespace, bool trimTrailingSpaces, int chomp)
{
	bool emptyLine = false, moreIndented = false;
	std::string scalar;

	while(INPUT) {
		// ********************************
		// Phase #1: scan until line ending
		while(!end.Matches(INPUT) && !Exp::Break.Matches(INPUT)) {
			if(INPUT.peek() == EOF)
				break;

			// escaped newline? (only if we're escaping on slash)
			if(escape == '\\' && Exp::EscBreak.Matches(INPUT)) {
				int n = Exp::EscBreak.Match(INPUT);
				Eat(n);
				continue;
			}

			// escape this?
			if(INPUT.peek() == escape) {
				int length = 0;
				scalar += Exp::Escape(INPUT, length);
				// Exp::Escape read straight from the stream, so account for the column ourselves
				m_column += length;
				continue;
			}

			// otherwise, just add the damn character
			scalar += GetChar();
		}

		// eof? if we're looking to eat something, then we throw
		if(INPUT.peek() == EOF) {
			if(eatEnd)
				throw EOFInQuote();
			break;
		}

		// are we done via character match?
		int n = end.Match(INPUT);
		if(n >= 0) {
			if(eatEnd)
				Eat(n);
			break;
		}

		// ********************************
		// Phase #2: eat line ending
		n = Exp::Break.Match(INPUT);
		Eat(n);

		// ********************************
		// Phase #3: scan initial spaces

		// first the required indentation
		while(INPUT.peek() == ' ' && m_column < indent)
			Eat(1);

		// and then the rest of the whitespace
		if(eatLeadingWhitespace) {
			while(Exp::Blank.Matches(INPUT))
				Eat(1);
		}

		// was this an empty line?
		bool nextEmptyLine = Exp::Break.Matches(INPUT);
		bool nextMoreIndented = (INPUT.peek() == ' ');

		// only a break between two "ordinary" lines may be folded into a space
		if(fold && !emptyLine && !nextEmptyLine && !moreIndented && !nextMoreIndented)
			scalar += " ";
		else
			scalar += "\n";

		emptyLine = nextEmptyLine;
		moreIndented = nextMoreIndented;

		// are we done via indentation?
		if(!emptyLine && m_column < indent)
			break;
	}

	// post-processing
	// (positions are kept as std::string::size_type and 'npos' is tested explicitly,
	// rather than narrowing to 'unsigned' and relying on wrap-around)
	if(trimTrailingSpaces) {
		const std::string::size_type lastNonSpace = scalar.find_last_not_of(' ');
		if(lastNonSpace != std::string::npos)
			scalar.erase(lastNonSpace + 1);
	}

	if(chomp <= 0) {
		// length of the scalar once all trailing newlines are dropped
		// (zero if the scalar is empty or consists only of newlines)
		const std::string::size_type lastNonBreak = scalar.find_last_not_of('\n');
		const std::string::size_type contentLength = (lastNonBreak == std::string::npos ? 0 : lastNonBreak + 1);

		if(chomp == 0) {
			// clip: keep exactly one of the trailing newlines (if there are any)
			if(contentLength < scalar.size())
				scalar.erase(contentLength + 1);
		} else if(chomp == -1) {
			// strip: drop every trailing newline, even if nothing else is left
			scalar.erase(contentLength);
		}
	}

	return scalar;
}
}

View file

@ -1,13 +1,4 @@
people:
- &jsb
name: Jesse
age: 23
- &dab
name: 'Daniel'
age: 25
- &ncb
name: "Naftali"
age: 21
students:
- *jsb
- *ncb
---
- "quoted scalar that contains
---
the document start!"