From 49a75b2d7802fda2cb4ac5aeea09706d24f65afc Mon Sep 17 00:00:00 2001 From: Jesse Beder Date: Fri, 27 Jun 2008 23:11:46 +0000 Subject: [PATCH] Added quoted scalars (with escaping). Refactored some common whitespace-parsing code in scanning both scalars. Implemented the flow collection tokens. --- exceptions.h | 17 +++++ exp.cpp | 106 ++++++++++++++++++++++++++ exp.h | 14 +++- regex.cpp | 2 +- scanner.cpp | 16 ++++ scanner.h | 12 +++ scantoken.cpp | 180 ++++++++++++++++++++++++++++++++++----------- test.yaml | 10 +-- yaml-reader.vcproj | 4 + 9 files changed, 307 insertions(+), 54 deletions(-) create mode 100644 exp.cpp diff --git a/exceptions.h b/exceptions.h index 2b4e689ebf..8af9dbd2c1 100644 --- a/exceptions.h +++ b/exceptions.h @@ -12,4 +12,21 @@ namespace YAML class IllegalMapValue: public Exception {}; class IllegalScalar: public Exception {}; class IllegalTabInScalar: public Exception {}; + class DocIndicatorInQuote: public Exception {}; + class EOFInQuote: public Exception {}; + class UnknownEscapeSequence: public Exception { + public: + UnknownEscapeSequence(char ch_): ch(ch_) {} + char ch; + }; + class NonHexNumber: public Exception { + public: + NonHexNumber(char ch_): ch(ch_) {} + char ch; + }; + class InvalidUnicode: public Exception { + public: + InvalidUnicode(unsigned value_): value(value_) {} + unsigned value; + }; } diff --git a/exp.cpp b/exp.cpp new file mode 100644 index 0000000000..21b00660d2 --- /dev/null +++ b/exp.cpp @@ -0,0 +1,106 @@ +#include "exp.h" +#include "exceptions.h" + +namespace YAML +{ + namespace Exp + { + unsigned ParseHex(std::string str) + { + unsigned value = 0; + for(unsigned i=0;i= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) + throw InvalidUnicode(value); + + // now break it up into chars + if(value <= 0x7F) + return Str(value); + else if(value <= 0x7FF) + return Str(0xC0 + (value >> 6)) + Str(0x80 + (value & 0x3F)); + else if(value <= 0xFFFF) + return Str(0xE0 + (value >> 12)) + Str(0x80 + ((value >> 6) & 0x3F)) + Str(0x80 + (value & 0x3F)); + else + return Str(0xF0 + (value >> 18)) + Str(0x80 + ((value >> 12) & 0x3F)) + + Str(0x80 + ((value >> 6) & 0x3F)) + Str(0x80 + (value & 0x3F)); + } + + // Escape + // . Escapes the sequence starting 'in' (it must begin with a '\') + // and returns the result. + // . Fills 'length' with how many characters we ate. + // . Throws if it's an unknown escape character. + std::string Escape(std::istream& in, int& length) + { + // slash + character + length = 2; + + // eat slash + in.get(); + + // switch on escape character + char ch = in.get(); + switch(ch) { + case '0': return "\0"; + case 'a': return "\x07"; + case 'b': return "\x08"; + case 't': + case '\t': return "\x09"; + case 'n': return "\x0A"; + case 'v': return "\x0B"; + case 'f': return "\x0C"; + case 'r': return "\x0D"; + case 'e': return "\x1B"; + case ' ': return "\x20"; + case '\"': return "\""; + case '\'': return "\'"; + case '\\': return "\\"; + case 'N': return "\xC2\x85"; // NEL (#x85) + case '_': return "\xC2\xA0"; // #xA0 + case 'L': return "\xE2\x80\xA8"; // LS (#x2028) + case 'P': return "\xE2\x80\xA9"; // PS (#x2029) + case 'x': return Escape(in, length, 2); + case 'u': return Escape(in, length, 4); + case 'U': return Escape(in, length, 8); + } + + throw UnknownEscapeSequence(ch); + } + } +} diff --git a/exp.h b/exp.h index bc2c59b91b..3ad948fe16 100644 --- a/exp.h +++ b/exp.h @@ -1,6 +1,8 @@ #pragma once #include "regex.h" +#include +#include namespace YAML { @@ -13,6 +15,8 @@ namespace YAML const RegEx Blank = RegEx(' ') || RegEx('\t'); const RegEx Break = RegEx('\n'); const RegEx BlankOrBreak = Blank || Break; + const RegEx Digit = RegEx('0', '9'); + const RegEx Hex = Digit || RegEx('A', 'F') || RegEx('a', 'f'); // actual tags @@ -30,11 +34,17 @@ namespace YAML // . Can never start with any of , [ ] { } # & * ! | > \' \" % @ ` // . In the block context - ? : must be not be followed with a space. // . In the flow context ? : are illegal and - must not be followed with a space. - const RegEx PlainScalar = !(BlankOrBreak || RegEx(",[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx("-?:") + Blank)), + const RegEx PlainScalar = !(BlankOrBreak || RegEx(",[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx("-?:", REGEX_OR) + Blank)), PlainScalarInFlow = !(BlankOrBreak || RegEx("?:,[]{}#&*!|>\'\"%@`", REGEX_OR) || (RegEx('-') + Blank)); const RegEx IllegalColonInScalar = RegEx(':') + !BlankOrBreak; const RegEx EndScalar = RegEx(':') + BlankOrBreak, - EndScalarInFlow = (RegEx(':') + BlankOrBreak) || RegEx(",:?[]{}"); + EndScalarInFlow = (RegEx(':') + BlankOrBreak) || RegEx(",:?[]{}", REGEX_OR); + + const RegEx EscSingleQuote = RegEx("\'\'"); + const RegEx EscBreak = RegEx('\\') + Break; + + // and some functions + std::string Escape(std::istream& in, int& length); } namespace Keys diff --git a/regex.cpp b/regex.cpp index f7830b7f02..342c0bd90e 100644 --- a/regex.cpp +++ b/regex.cpp @@ -35,7 +35,7 @@ namespace YAML RegEx::RegEx(const std::string& str, REGEX_OP op): m_op(op), m_pOp(0) { for(unsigned i=0;i 0) { + m_flowLevel--; + // TODO: Pop simple key + } + } + // temporary function for testing void Scanner::Scan() { diff --git a/scanner.h b/scanner.h index 8db99be793..df8994488f 100644 --- a/scanner.h +++ b/scanner.h @@ -20,6 +20,8 @@ namespace YAML void ScanToNextToken(); void PushIndentTo(int column, bool sequence); void PopIndentTo(int column); + void IncreaseFlowLevel(); + void DecreaseFlowLevel(); void Scan(); @@ -37,6 +39,16 @@ namespace YAML bool IsValue(); bool IsPlainScalar(); + struct WhitespaceInfo { + WhitespaceInfo(); + void AddBlank(char ch); + void AddBreak(const std::string& line); + std::string Join(); + + bool leadingBlanks; + std::string whitespace, leadingBreaks, trailingBreaks; + }; + template void ScanAndEnqueue(T *pToken); template T *ScanToken(T *pToken); diff --git a/scantoken.cpp b/scantoken.cpp index 0b24d9a17a..48141fd10b 100644 --- a/scantoken.cpp +++ b/scantoken.cpp @@ -64,8 +64,8 @@ namespace YAML template <> FlowSeqStartToken *Scanner::ScanToken(FlowSeqStartToken *pToken) { // TODO: "save simple key" - // TODO: increase flow level + IncreaseFlowLevel(); m_simpleKeyAllowed = true; // eat @@ -77,8 +77,8 @@ namespace YAML template <> FlowMapStartToken *Scanner::ScanToken(FlowMapStartToken *pToken) { // TODO: "save simple key" - // TODO: increase flow level + IncreaseFlowLevel(); m_simpleKeyAllowed = true; // eat @@ -90,8 +90,8 @@ namespace YAML template <> FlowSeqEndToken *Scanner::ScanToken(FlowSeqEndToken *pToken) { // TODO: "remove simple key" - // TODO: decrease flow level + DecreaseFlowLevel(); m_simpleKeyAllowed = false; // eat @@ -103,8 +103,8 @@ namespace YAML template <> FlowMapEndToken *Scanner::ScanToken(FlowMapEndToken *pToken) { // TODO: "remove simple key" - // TODO: decrease flow level + DecreaseFlowLevel(); m_simpleKeyAllowed = false; // eat @@ -210,8 +210,8 @@ namespace YAML m_simpleKeyAllowed = false; // now eat and store the scalar - std::string scalar, whitespace, leadingBreaks, trailingBreaks; - bool leadingBlanks = false; + std::string scalar; + WhitespaceInfo info; while(INPUT) { // doc start/end tokens @@ -234,26 +234,6 @@ namespace YAML if(m_flowLevel == 0 && Exp::EndScalar.Matches(INPUT)) break; - // join whitespace - if(leadingBlanks) { - if(Exp::Break.Matches(leadingBreaks)) { - // fold line break? - if(trailingBreaks.empty()) - scalar += ' '; - else - scalar += trailingBreaks; - } else { - scalar += leadingBreaks + trailingBreaks; - } - - leadingBlanks = false; - leadingBreaks = ""; - trailingBreaks = ""; - } else if(!whitespace.empty()) { - scalar += whitespace; - whitespace = ""; - } - // finally, read the character! scalar += GetChar(); } @@ -266,37 +246,29 @@ namespace YAML while(INPUT && Exp::BlankOrBreak.Matches(INPUT)) { if(Exp::Blank.Matches(INPUT)) { // can't use tabs as indentation! only spaces! - if(INPUT.peek() == '\t' && leadingBlanks && m_column <= m_indents.top()) + if(INPUT.peek() == '\t' && info.leadingBlanks && m_column <= m_indents.top()) throw IllegalTabInScalar(); - // maybe store this character - if(!leadingBlanks) - whitespace += GetChar(); - else - Eat(1); - } else { + info.AddBlank(GetChar()); + } else { // we know it's a line break; see how many characters to read int n = Exp::Break.Match(INPUT); std::string line = GetChar(n); - - // where to store this character? - if(!leadingBlanks) { - leadingBlanks = true; - whitespace = ""; - leadingBreaks += line; - } else - trailingBreaks += line; + info.AddBreak(line); } } - // and finally break if we're below the indentation level + // break if we're below the indentation level if(m_flowLevel == 0 && m_column <= m_indents.top()) break; + + // finally join whitespace + scalar += info.Join(); } // now modify our token pToken->value = scalar; - if(leadingBlanks) + if(info.leadingBlanks) m_simpleKeyAllowed = true; return pToken; @@ -305,6 +277,128 @@ namespace YAML // QuotedScalarToken template <> QuotedScalarToken *Scanner::ScanToken(QuotedScalarToken *pToken) { + // TODO: "save simple key" + + m_simpleKeyAllowed = false; + + // eat single or double quote + char quote = GetChar(); + bool single = (quote == '\''); + + // now eat and store the scalar + std::string scalar; + WhitespaceInfo info; + + while(INPUT) { + if(IsDocumentStart() || IsDocumentEnd()) + throw DocIndicatorInQuote(); + + if(INPUT.peek() == EOF) + throw EOFInQuote(); + + // first eat non-blanks + while(INPUT && !Exp::BlankOrBreak.Matches(INPUT)) { + // escaped single quote? + if(single && Exp::EscSingleQuote.Matches(INPUT)) { + int n = Exp::EscSingleQuote.Match(INPUT); + scalar += GetChar(n); + continue; + } + + // is the quote ending? + if(INPUT.peek() == (single ? '\'' : '\"')) + break; + + // escaped newline? + if(Exp::EscBreak.Matches(INPUT)) + break; + + // other escape sequence + if(INPUT.peek() == '\\') { + int length = 0; + scalar += Exp::Escape(INPUT, length); + m_column += length; + continue; + } + + // and finally, just add the damn character + scalar += GetChar(); + } + + // is the quote ending? + if(INPUT.peek() == (single ? '\'' : '\"')) { + // eat and go + GetChar(); + break; + } + + // now we eat blanks + while(Exp::BlankOrBreak.Matches(INPUT)) { + if(Exp::Blank.Matches(INPUT)) { + info.AddBlank(GetChar()); + } else { + // we know it's a line break; see how many characters to read + int n = Exp::Break.Match(INPUT); + std::string line = GetChar(n); + info.AddBreak(line); + } + } + + // and finally join the whitespace + scalar += info.Join(); + } + + pToken->value = scalar; return pToken; } + + ////////////////////////////////////////////////////////// + // WhitespaceInfo stuff + + Scanner::WhitespaceInfo::WhitespaceInfo(): leadingBlanks(false) + { + } + + void Scanner::WhitespaceInfo::AddBlank(char ch) + { + if(!leadingBlanks) + whitespace += ch; + } + + void Scanner::WhitespaceInfo::AddBreak(const std::string& line) + { + // where to store this character? + if(!leadingBlanks) { + leadingBlanks = true; + whitespace = ""; + leadingBreaks += line; + } else + trailingBreaks += line; + } + + std::string Scanner::WhitespaceInfo::Join() + { + std::string ret; + + if(leadingBlanks) { + if(Exp::Break.Matches(leadingBreaks)) { + // fold line break? + if(trailingBreaks.empty()) + ret = " "; + else + ret = trailingBreaks; + } else { + ret = leadingBreaks + trailingBreaks; + } + + leadingBlanks = false; + leadingBreaks = ""; + trailingBreaks = ""; + } else if(!whitespace.empty()) { + ret = whitespace; + whitespace = ""; + } + + return ret; + } } diff --git a/test.yaml b/test.yaml index 64d38a7a81..5e865ddfa7 100644 --- a/test.yaml +++ b/test.yaml @@ -1,10 +1,4 @@ --- -- green - eggs, - and - ham! -- eggs # this is really important! -- - cheddar cheese - - american cheese -- bread +- milk and eggs +- [cheddar, american, swiss] ... \ No newline at end of file diff --git a/yaml-reader.vcproj b/yaml-reader.vcproj index 39f42a4bc2..bcb10e88d4 100644 --- a/yaml-reader.vcproj +++ b/yaml-reader.vcproj @@ -169,6 +169,10 @@ RelativePath=".\document.cpp" > + +