From 4e435b1321f09674ab3345ba8e99bc53f20f9b68 Mon Sep 17 00:00:00 2001 From: Jesse Beder Date: Fri, 27 Jun 2008 08:20:41 +0000 Subject: [PATCH] Wrote a simplified regular expression parser to make life easier (it only does single matches; i.e., no one-or-more matches, etc.). Fixed some of the whitespace/line break matching. --- main.cpp | 17 ++++++ regex.cpp | 114 ++++++++++++++++++++++++++++++++++++++ regex.h | 37 +++++++++++++ scanner.cpp | 133 ++++++++++++++++++++++----------------------- scanner.h | 37 +++++++++++-- test.yaml | 6 +- yaml-reader.vcproj | 8 +++ 7 files changed, 277 insertions(+), 75 deletions(-) create mode 100644 regex.cpp create mode 100644 regex.h diff --git a/main.cpp b/main.cpp index f5f3fa89a4..6ff8af6424 100644 --- a/main.cpp +++ b/main.cpp @@ -1,7 +1,24 @@ #include "document.h" +#include "regex.h" int main() { + YAML::RegEx alpha = YAML::RegEx('a', 'z') || YAML::RegEx('A', 'Z'); + alpha.Matches("a"); + alpha.Matches("d"); + alpha.Matches("F"); + alpha.Matches("0"); + alpha.Matches("5"); + alpha.Matches(" "); + + YAML::RegEx blank = YAML::RegEx(' ') || YAML::RegEx('\t'); + YAML::RegEx docstart = YAML::RegEx("---") + (blank || YAML::RegEx(EOF) || YAML::RegEx()); + docstart.Matches("--- "); + docstart.Matches("... "); + docstart.Matches("----"); + docstart.Matches("---\t"); + docstart.Matches("---"); + YAML::Document doc("test.yaml"); return 0; diff --git a/regex.cpp b/regex.cpp new file mode 100644 index 0000000000..a2907f513f --- /dev/null +++ b/regex.cpp @@ -0,0 +1,114 @@ +#include "regex.h" + +namespace YAML +{ + RegEx::RegEx(REGEX_OP op): m_op(op) + { + } + + RegEx::RegEx(): m_op(REGEX_EMPTY) + { + } + + RegEx::RegEx(char ch): m_op(REGEX_MATCH), m_a(ch) + { + } + + RegEx::RegEx(char a, char z): m_op(REGEX_RANGE), m_a(a), m_z(z) + { + } + + RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op) + { + for(unsigned i=0;i= 0; + } + + // Match + // . Matches the given string against this regular expression. + // . Returns the number of characters matched. + // . Returns -1 if no characters were matched (the reason for + // not returning zero is that we may have an empty regex + // which SHOULD be considered successfully matching nothing, + // but that of course matches zero characters). + int RegEx::Match(const std::string& str) const + { + switch(m_op) { + case REGEX_EMPTY: + if(str.empty()) + return 0; + return -1; + case REGEX_MATCH: + if(str.empty() || str[0] != m_a) + return -1; + return 1; + case REGEX_RANGE: + if(str.empty() || m_a > str[0] || m_z < str[0]) + return -1; + return 1; + case REGEX_NOT: + if(m_params.empty()) + return false; + if(m_params[0].Match(str) >= 0) + return -1; + return 1; + case REGEX_OR: + for(unsigned i=0;i= 0) + return n; + } + return -1; + case REGEX_SEQ: + int offset = 0; + for(unsigned i=0;i +#include + +namespace YAML +{ + enum REGEX_OP { REGEX_EMPTY, REGEX_MATCH, REGEX_RANGE, REGEX_OR, REGEX_NOT, REGEX_SEQ }; + + // simplified regular expressions + // . Only straightforward matches (no repeated characters) + // . Only matches from start of string + class RegEx { + public: + RegEx(); + RegEx(char ch); + RegEx(char a, char z); + RegEx(const std::string& str, REGEX_OP op = REGEX_SEQ); + ~RegEx(); + + bool Matches(char ch) const; + bool Matches(const std::string& str) const; + int Match(const std::string& str) const; + + friend RegEx operator ! (const RegEx& ex); + friend RegEx operator || (const RegEx& ex1, const RegEx& ex2); + friend RegEx operator + (const RegEx& ex1, const RegEx& ex2); + + private: + RegEx(REGEX_OP op); + + private: + REGEX_OP m_op; + char m_a, m_z; + std::vector m_params; + }; +} diff --git a/scanner.cpp b/scanner.cpp index 6fed538d7d..e1495f5baa 100644 --- a/scanner.cpp +++ b/scanner.cpp @@ -30,7 +30,10 @@ namespace YAML char Scanner::GetChar() { m_column++; - return INPUT.get(); + char ch = INPUT.get(); + if(ch == '\n') + m_column = 0; + return ch; } // Eat @@ -87,18 +90,6 @@ namespace YAML return false; } - // IsLineBreak - bool Scanner::IsLineBreak(char ch) - { - return ch == '\n'; // TODO: More types of line breaks - } - - // IsBlank - bool Scanner::IsBlank(char ch) - { - return IsLineBreak(ch) || ch == ' ' || ch == '\t' || ch == EOF; - } - // IsDocumentStart bool Scanner::IsDocumentStart() { @@ -106,8 +97,7 @@ namespace YAML if(m_column != 0) return false; - std::string next = Peek(4); - return next[0] == '-' && next[1] == '-' && next[2] == '-' && IsBlank(next[3]); + return Exp::DocStart.Matches(Peek(4)); } // IsDocumentEnd @@ -117,61 +107,41 @@ namespace YAML if(m_column != 0) return false; - std::string next = Peek(4); - return next[0] == '.' && next[1] == '.' && next[2] == '.' && IsBlank(next[3]); + return Exp::DocEnd.Matches(Peek(4)); } // IsBlockEntry bool Scanner::IsBlockEntry() { - std::string next = Peek(2); - return next[0] == Keys::BlockEntry && IsBlank(next[1]); + return Exp::BlockEntry.Matches(Peek(2)); } // IsKey bool Scanner::IsKey() { std::string next = Peek(2); - return next[0] == Keys::Key && (IsBlank(next[1]) || m_flowLevel > 0); + if(m_flowLevel > 0) + return Exp::KeyInFlow.Matches(next); + return Exp::Key.Matches(next); } // IsValue bool Scanner::IsValue() { std::string next = Peek(2); - return next[0] == Keys::Value && (IsBlank(next[1]) || m_flowLevel > 0); + if(m_flowLevel > 0) + return Exp::ValueInFlow.Matches(next); + return Exp::Value.Matches(next); } // IsPlainScalar // . Rules: - // . Cannot start with a blank. - // . Can never start with any of , [ ] { } # & * ! | > \' \" % @ ` - // . In the block context - ? : must be not be followed with a space. - // . In the flow context ? : are illegal and - must not be followed with a space. bool Scanner::IsPlainScalar() { std::string next = Peek(2); - - if(IsBlank(next[0])) - return false; - - // never characters - if(std::string(",[]{}#&*!|>\'\"%@`").find(next[0]) != std::string::npos) - return false; - - // specific block/flow characters - if(m_flowLevel == 0) { - if((next[0] == '-' || next[0] == '?' || next[0] == ':') && IsBlank(next[1])) - return false; - } else { - if(next[0] == '?' || next[0] == ':') - return false; - - if(next[0] == '-' && IsBlank(next[1])) - return false; - } - - return true; + if(m_flowLevel > 0) + return Exp::PlainScalarInFlow.Matches(next); + return Exp::PlainScalar.Matches(next); } /////////////////////////////////////////////////////////////////////// @@ -233,7 +203,7 @@ namespace YAML // DocumentEndToken template <> DocumentEndToken *Scanner::ScanToken(DocumentEndToken *pToken) { - PopIndentTo(m_column); + PopIndentTo(-1); // TODO: "reset simple keys" m_simpleKeyAllowed = false; @@ -389,8 +359,8 @@ namespace YAML m_simpleKeyAllowed = false; // now eat and store the scalar - std::string scalar; - bool leadingBlanks = true; + std::string scalar, whitespace, leadingBreaks, trailingBreaks; + bool leadingBlanks = false; while(INPUT) { // doc start/end tokens @@ -398,43 +368,72 @@ namespace YAML break; // comment - if(INPUT.peek() == Keys::Comment) + if(Exp::Comment.Matches(INPUT.peek())) break; // first eat non-blanks - while(INPUT && !IsBlank(INPUT.peek())) { + while(INPUT && !Exp::BlankOrBreak.Matches(INPUT.peek())) { std::string next = Peek(2); // illegal colon in flow context - if(m_flowLevel > 0 && next[0] == ':') { - if(!IsBlank(next[1])) - throw IllegalScalar(); - } + if(m_flowLevel > 0 && Exp::IllegalColonInScalar.Matches(next)) + throw IllegalScalar(); // characters that might end the scalar - if(next[0] == ':' && IsBlank(next[1])) + if(m_flowLevel > 0 && Exp::EndScalarInFlow.Matches(next)) break; - if(m_flowLevel > 0 && std::string(",:?[]{}").find(next[0]) != std::string::npos) + if(m_flowLevel == 0 && Exp::EndScalar.Matches(next)) break; + if(leadingBlanks) { + if(!leadingBreaks.empty() && leadingBreaks[0] == '\n') { + // fold line break? + if(trailingBreaks.empty()) + scalar += ' '; + else { + scalar += trailingBreaks; + trailingBreaks = ""; + } + } else { + scalar += leadingBreaks + trailingBreaks; + leadingBreaks = ""; + trailingBreaks = ""; + } + } else if(!whitespace.empty()) { + scalar += whitespace; + whitespace = ""; + } + + // finally, read the character! scalar += GetChar(); } + // did we hit a non-blank character that ended us? + if(!Exp::BlankOrBreak.Matches(INPUT.peek())) + break; + // now eat blanks - while(INPUT && (IsBlank(INPUT.peek()) /* || IsBreak(INPUT.peek()) */)) { - if(IsBlank(INPUT.peek())) { + while(INPUT && Exp::BlankOrBreak.Matches(INPUT.peek())) { + if(Exp::Blank.Matches(INPUT.peek())) { if(leadingBlanks && m_column <= m_indents.top()) throw IllegalTabInScalar(); - // TODO: Store some blanks? - Eat(1); + // maybe store this character + if(!leadingBlanks) + whitespace += GetChar(); + else + Eat(1); } else { - Eat(1); + // where to store this character? + if(!leadingBlanks) { + leadingBlanks = true; + whitespace = ""; + leadingBreaks += GetChar(); + } else + trailingBreaks += GetChar(); } } - // TODO: join whitespace - // and finally break if we're below the indentation level if(m_flowLevel == 0 && m_column <= m_indents.top()) break; @@ -532,14 +531,14 @@ namespace YAML Eat(1); // then eat a comment - if(INPUT.peek() == Keys::Comment) { + if(Exp::Comment.Matches(INPUT.peek())) { // eat until line break - while(INPUT && !IsLineBreak(INPUT.peek())) + while(INPUT && !Exp::Break.Matches(INPUT.peek())) Eat(1); } // if it's NOT a line break, then we're done! - if(!IsLineBreak(INPUT.peek())) + if(!Exp::Break.Matches(INPUT.peek())) break; // otherwise, let's eat the line break and keep going diff --git a/scanner.h b/scanner.h index 83f2583c9f..94cd88ab0e 100644 --- a/scanner.h +++ b/scanner.h @@ -5,22 +5,49 @@ #include #include #include +#include "regex.h" namespace YAML { class Token; + namespace Exp + { + // misc + const RegEx Blank = RegEx(' ') || RegEx('\t'); + const RegEx Break = RegEx('\n'); + const RegEx BlankOrBreak = Blank || Break; + + // actual tags + + const RegEx DocStart = RegEx("---") + (BlankOrBreak || RegEx(EOF) || RegEx()); + const RegEx DocEnd = RegEx("...") + (BlankOrBreak || RegEx(EOF) || RegEx()); + const RegEx BlockEntry = RegEx('-') + (BlankOrBreak || RegEx(EOF)); + const RegEx Key = RegEx('?'), + KeyInFlow = RegEx('?') + BlankOrBreak; + const RegEx Value = RegEx(':'), + ValueInFlow = RegEx(':') + BlankOrBreak; + const RegEx Comment = RegEx('#'); + + // Plain scalar rules: + // . Cannot start with a blank. + // . Can never start with any of , [ ] { } # & * ! | > \' \" % @ ` + // . In the block context - ? : must be not be followed with a space. + // . In the flow context ? : are illegal and - must not be followed with a space. + const RegEx PlainScalar = !(BlankOrBreak || RegEx(",[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx("-?:") + Blank)), + PlainScalarInFlow = !(BlankOrBreak || RegEx("?:,[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx('-') + Blank)); + const RegEx IllegalColonInScalar = RegEx(':') + !BlankOrBreak; + const RegEx EndScalar = RegEx(':') + BlankOrBreak, + EndScalarInFlow = (RegEx(':') + BlankOrBreak) || RegEx(",:?[]{}"); + } + namespace Keys { - const char Comment = '#'; const char FlowSeqStart = '['; const char FlowSeqEnd = ']'; const char FlowMapStart = '{'; const char FlowMapEnd = '}'; const char FlowEntry = ','; - const char BlockEntry = '-'; - const char Key = '?'; - const char Value = ':'; const char Alias = '*'; const char Anchor = '&'; const char Tag = '!'; @@ -49,8 +76,6 @@ namespace YAML void EatLineBreak(); bool IsWhitespaceToBeEaten(char ch); - bool IsLineBreak(char ch); - bool IsBlank(char ch); bool IsDocumentStart(); bool IsDocumentEnd(); bool IsBlockEntry(); diff --git a/test.yaml b/test.yaml index 0581ec1257..d3ad2c7190 100644 --- a/test.yaml +++ b/test.yaml @@ -1,3 +1,5 @@ +--- - milk -- eggs -- cheese and bread # this is really important! +- eggs # this is really important! +- cheese and bread +... \ No newline at end of file diff --git a/yaml-reader.vcproj b/yaml-reader.vcproj index 04a440f9ab..e0c32de3f9 100644 --- a/yaml-reader.vcproj +++ b/yaml-reader.vcproj @@ -185,6 +185,10 @@ RelativePath=".\parser.cpp" > + + @@ -227,6 +231,10 @@ RelativePath=".\parser.h" > + +