#! /usr/local/bin/ruby -Ku ## ymHTML - Simple HTML Parser ## (c) 2003-2007 yoshidam ## You can redistribute it and/or modify it under the same term as Ruby. ## ## Nov 17, 2007 yoshidam version 0.1.13 Windows-1252 ## Oct 30, 2006 yoshidam version 0.1.12 NKF, Iconv ## Sep 23, 2006 yoshidam version 0.1.11 comment end bug fix ## Apr 12, 2006 yoshidam version 0.1.10 forceHTML option ## Mar 7, 2006 yoshidam version 0.1.9 iso-2022-jp bug fix ## Nov 15, 2005 yoshidam version 0.1.8 table border ## Apr 6, 2004 yoshidam version 0.1.7 InputStream ## Mar 10, 2004 yoshidam version 0.1.6 exception, InputStream ## Sep 17, 2003 yoshidam version 0.1.5 bug fix ## Apr 05, 2003 yoshidam version 0.1.4 ## Apr 04, 2003 yoshidam version 0.1.3 ## Apr 02, 2003 yoshidam version 0.1.2 ## Mar 27, 2003 yoshidam version 0.1.1 ## Mar 26, 2003 yoshidam version 0.1.0 module YmHTML VERSION = 0.113 class Error < StandardError end class ParseError < Error end class EncodingError < Error end class Parser HEAD_MISC = "script|style|meta|link|object" HEADING = "h1|h2|h3|h4|h5|h6" LIST = "ul|ol|dir|menu" PREFORMATTED = "pre" FONTSTYLE = "tt|i|b|u|s|strike|big|small" PHRASE = "em|strong|dfn|code|samp|kbd|var|cite|abbr|acronym" SPECIAL = "a|img|applet|object|font|basefont|br|script|map|q|sub|sup|span|bdo|iframe" FORMCTRL = "input|select|textarea|label|button" INLINE = "#{FONTSTYLE}|#{PHRASE}|#{SPECIAL}|#{FORMCTRL}|ins|del" BLOCK = "p|#{HEADING}|#{LIST}|#{PREFORMATTED}|dl|div|center|noscript|noframes|blockquote|form|isindex|hr|table|fieldset|address|ins|del" FLOW = "#{BLOCK}|#{INLINE}" EMPTY = '' ContentList = {} OpenElements = [ ## ['omitted tag', 'outer', 'inner'] ['html', nil, /^(head|body)$/u], ['head', 'html', /^(title|base|#{HEAD_MISC})$/u], ['body', 'html', /^(#{BLOCK}|script|ins|del)$/u], ['body', 'noframes', /^(#{BLOCK}|script|ins|del)$/u], ['tbody', 'table', /^tr$/], ## invalid omissions ['tr', 'tbody', /^td$/], ['dd', 'dl', /^(#{FLOW})$/], ['td', 'tr', /^(#{FLOW})$/], ['ul', proc {|p| p !~ /^(ul|ol|li)$/}, /^li$/], ] HAVE_PCDATA = /^(option|textarea|fieldset|title|#{FONTSTYLE}|#{PHRASE}|sub|sup|span|bdo|font|address|a|p|#{HEADING}|pre|q|dt|label|legend|caption|body|div|center|object|applet|blockquote|ins|del|dd|li|form|button|th|td|iframe|noscript)$/u ATTR_NAME = { 'table' => [ ['frame', /^(void|above|below|hsides|lhs|rhs|vsides|box|border)$/ ] ] } private def havePCDATA?(name) name =~ HAVE_PCDATA end def guessOmittedTag(parent, child) OpenElements.each do |e, p, c| if (c.is_a?(Regexp) && child =~ c) || (c.is_a?(Proc) && c.call(child)) if (p .nil? && parent.nil?) || (p.is_a?(String) && p == parent) || (p.is_a?(Proc) && p.call(parent)) return [e] elsif !e.is_a?(Proc) && !c.is_a?(Proc) if ret = guessOmittedTag(parent, e) return ret.push(e) end end end end return nil end def self.setContentList(elements, content) elements.split('|').each do |name| ContentList[name] = Regexp.new("^(#{content})$", nil, 'u') end end setContentList(PHRASE, INLINE) # setContentList('body', "#{BLOCK}|script") setContentList('body', FLOW) setContentList('p', INLINE) setContentList('dt', INLINE) setContentList('dd', FLOW) setContentList('li', FLOW) setContentList('option', '') setContentList('thead', 'tr') setContentList('tfoot', 'tr') setContentList('tbody', 'tr') setContentList('colgroup', 'col') setContentList('tr', 'th|td') setContentList('th|td', FLOW) setContentList('head', "title|base|#{HEAD_MISC}") setContentList('html', "head|body|frameset") ## empty element setContentList('br|area|link|img|param|hr|input|col|base|meta|basefont|frame|isindex', '') ## elements which cannot omit end tag setContentList("#{FONTSTYLE}|#{PHRASE}", INLINE) setContentList('sub|sup|bdo|font', INLINE) setContentList('address', "#{INLINE}|p") setContentList('div|center', FLOW) setContentList('a', INLINE.sub(/\ba\|/, '')) setContentList('map', "#{BLOCK}|area") setContentList('object|applet', "param|#{FLOW}") setContentList(HEADING, INLINE) setContentList('pre', INLINE) setContentList('q', INLINE) setContentList('blockquote|ins|del', FLOW) setContentList('dl', 'dt|dd') setContentList('ol|ul|dir|menu', 'li') setContentList('form', FLOW) setContentList('label', INLINE) setContentList('select', 'optgroup|option') setContentList('optgroup', 'option') setContentList('textarea', '') setContentList('fieldset', "legend|#{FLOW}") setContentList('legend|caption', INLINE) setContentList('button', FLOW) setContentList('table', 'caption|col|colgroup|thead|tfoot|tbody') setContentList('frameset', 'frameset|frame|noframes') setContentList('iframe', FLOW) setContentList('noframes', "body|#{FLOW}") setContentList('title', '') setContentList('style|script', '') setContentList('noscript', FLOW) def normalizeAttrValue(str) str.gsub(/[\x9\r\n]/u, ' ') end ## expand entityRef/charRef in context text def expandRef(text = nil) return '' if text.nil? ret = [] ret.taint if text.tainted? while text =~ /\&[\#0-9a-zA-Z]+\;?/u before = Regexp.last_match.pre_match ref = Regexp.last_match[0] text = Regexp.last_match.post_match ret.push(before) if before != '' if ref =~ /^\&\#(\d+);?$/u ## Numeric Character Reference (Decimal) ref = [$1.to_i].pack("U") elsif ref =~ /^\&\#x([0-9a-fA-F]+);?$/u ## Numeric Character Reference (Hexadecimal) ref = [$1.hex].pack("U") elsif !@xhtmlp && ref =~ /^\&\#X([0-9a-fA-F]+);?$/u ## Numeric Character Reference (Hexadecimal) ref = [$1.hex].pack("U") else ## Entity Reference # if !checkNameChar(ref.gsub(/\A\&([\#0-9a-zA-Z]+);?\Z/u, '\1')) # raise ParseError.new("illegal entity reference: #{ref.inspect}") # end ref = expandRef(getEntity(ref)) ## expand recursively end ret.push(ref) end ## end of while ret.push(text) if text != '' ret.join('') end ## expand entityRef/charRef in attribute value def expandAttrValue(text = nil) return '' if text.nil? ret = [] ret.taint if text.tainted? text = normalizeAttrValue(text) while text =~ /\&[\#0-9a-zA-Z]+\;?/u before = Regexp.last_match.pre_match ref = Regexp.last_match[0] text = Regexp.last_match.post_match ret.push(before) if before != '' if ref =~ /^\&\#(\d+);?$/u ## Numeric Character Reference (Decimal) ref = [$1.to_i].pack("U") elsif ref =~ /^\&\#x([0-9a-fA-F]+);?$/u ## Numeric Character Reference (Hexadecimal) ref = [$1.hex].pack("U") elsif !@xhtmlp && ref =~ /^\&\#X([0-9a-fA-F]+);?$/u ## Numeric Character Reference (Hexadecimal) ref = [$1.hex].pack("U") else ## Entity Reference # if !checkNameChar(ref.gsub(/\A\&([\#0-9a-zA-Z]+);?\Z/u, '\1')) # raise ParseError.new("illegal entity reference: #{ref.inspect}") # end ref = expandAttrValue(getEntity(ref)) ## expand recursively end ret.push(ref) end ## end of while ret.push(text) if text != '' ret.join('') end def registerEntity(entname, entval) if @entity[entname].nil? @entity[entname] = entval end end def getEntity(entname) name = entname.sub(/^\&?([\#0-9a-zA-Z]+)\;?$/u, '\1') if !@entity[name].nil? return @entity[name] end if @xhtmlp raise ParseError.new("undeclarated entity reference: #{entname.inspect}") end entname.sub(/&/, '&') end def initialize(encoding = nil) @content = '' @pos = -1 @entity = {} @encoding = encoding ? encoding.downcase : nil @forceHTML = false @xhtmlp = false @eliminateWhiteSpace = false registerEntity("quot", """) registerEntity("amp", "&") registerEntity("lt", "<") registerEntity("gt", ">") registerEntity("apos", "'") registerEntity("nbsp", " ") registerEntity("iexcl", "¡") registerEntity("cent", "¢") registerEntity("pound", "£") registerEntity("curren", "¤") registerEntity("yen", "¥") registerEntity("brvbar", "¦") registerEntity("sect", "§") registerEntity("uml", "¨") registerEntity("copy", "©") registerEntity("ordf", "ª") registerEntity("laquo", "«") registerEntity("not", "¬") registerEntity("shy", "­") registerEntity("reg", "®") registerEntity("macr", "¯") registerEntity("deg", "°") registerEntity("plusmn", "±") registerEntity("sup2", "²") registerEntity("sup3", "³") registerEntity("acute", "´") registerEntity("micro", "µ") registerEntity("para", "¶") registerEntity("middot", "·") registerEntity("cedil", "¸") registerEntity("sup1", "¹") registerEntity("ordm", "º") registerEntity("raquo", "»") registerEntity("frac14", "¼") registerEntity("frac12", "½") registerEntity("frac34", "¾") registerEntity("iquest", "¿") registerEntity("Agrave", "À") registerEntity("Aacute", "Á") registerEntity("Acirc", "Â") registerEntity("Atilde", "Ã") registerEntity("Auml", "Ä") registerEntity("Aring", "Å") registerEntity("AElig", "Æ") registerEntity("Ccedil", "Ç") registerEntity("Egrave", "È") registerEntity("Eacute", "É") registerEntity("Ecirc", "Ê") registerEntity("Euml", "Ë") registerEntity("Igrave", "Ì") registerEntity("Iacute", "Í") registerEntity("Icirc", "Î") registerEntity("Iuml", "Ï") registerEntity("ETH", "Ð") registerEntity("Ntilde", "Ñ") registerEntity("Ograve", "Ò") registerEntity("Oacute", "Ó") registerEntity("Ocirc", "Ô") registerEntity("Otilde", "Õ") registerEntity("Ouml", "Ö") registerEntity("times", "×") registerEntity("Oslash", "Ø") registerEntity("Ugrave", "Ù") registerEntity("Uacute", "Ú") registerEntity("Ucirc", "Û") registerEntity("Uuml", "Ü") registerEntity("Yacute", "Ý") registerEntity("THORN", "Þ") registerEntity("szlig", "ß") registerEntity("agrave", "à") registerEntity("aacute", "á") registerEntity("acirc", "â") registerEntity("atilde", "ã") registerEntity("auml", "ä") registerEntity("aring", "å") registerEntity("aelig", "æ") registerEntity("ccedil", "ç") registerEntity("egrave", "è") registerEntity("eacute", "é") registerEntity("ecirc", "ê") registerEntity("euml", "ë") registerEntity("igrave", "ì") registerEntity("iacute", "í") registerEntity("icirc", "î") registerEntity("iuml", "ï") registerEntity("eth", "ð") registerEntity("ntilde", "ñ") registerEntity("ograve", "ò") registerEntity("oacute", "ó") registerEntity("ocirc", "ô") registerEntity("otilde", "õ") registerEntity("ouml", "ö") registerEntity("divide", "÷") registerEntity("oslash", "ø") registerEntity("ugrave", "ù") registerEntity("uacute", "ú") registerEntity("ucirc", "û") registerEntity("uuml", "ü") registerEntity("yacute", "ý") registerEntity("thorn", "þ") registerEntity("yuml", "ÿ") registerEntity("fnof", "ƒ") registerEntity("Alpha", "Α") registerEntity("Beta", "Β") registerEntity("Gamma", "Γ") registerEntity("Delta", "Δ") registerEntity("Epsilon", "Ε") registerEntity("Zeta", "Ζ") registerEntity("Eta", "Η") registerEntity("Theta", "Θ") registerEntity("Iota", "Ι") registerEntity("Kappa", "Κ") registerEntity("Lambda", "Λ") registerEntity("Mu", "Μ") registerEntity("Nu", "Ν") registerEntity("Xi", "Ξ") registerEntity("Omicron", "Ο") registerEntity("Pi", "Π") registerEntity("Rho", "Ρ") registerEntity("Sigma", "Σ") registerEntity("Tau", "Τ") registerEntity("Upsilon", "Υ") registerEntity("Phi", "Φ") registerEntity("Chi", "Χ") registerEntity("Psi", "Ψ") registerEntity("Omega", "Ω") registerEntity("alpha", "α") registerEntity("beta", "β") registerEntity("gamma", "γ") registerEntity("delta", "δ") registerEntity("epsilon", "ε") registerEntity("zeta", "ζ") registerEntity("eta", "η") registerEntity("theta", "θ") registerEntity("iota", "ι") registerEntity("kappa", "κ") registerEntity("lambda", "λ") registerEntity("mu", "μ") registerEntity("nu", "ν") registerEntity("xi", "ξ") registerEntity("omicron", "ο") registerEntity("pi", "π") registerEntity("rho", "ρ") registerEntity("sigmaf", "ς") registerEntity("sigma", "σ") registerEntity("tau", "τ") registerEntity("upsilon", "υ") registerEntity("phi", "φ") registerEntity("chi", "χ") registerEntity("psi", "ψ") registerEntity("omega", "ω") registerEntity("thetasym", "ϑ") registerEntity("upsih", "ϒ") registerEntity("piv", "ϖ") registerEntity("bull", "•") registerEntity("hellip", "…") registerEntity("prime", "′") registerEntity("Prime", "″") registerEntity("oline", "‾") registerEntity("frasl", "⁄") registerEntity("weierp", "℘") registerEntity("image", "ℑ") registerEntity("real", "ℜ") registerEntity("trade", "™") registerEntity("alefsym", "ℵ") registerEntity("larr", "←") registerEntity("uarr", "↑") registerEntity("rarr", "→") registerEntity("darr", "↓") registerEntity("harr", "↔") registerEntity("crarr", "↵") registerEntity("lArr", "⇐") registerEntity("uArr", "⇑") registerEntity("rArr", "⇒") registerEntity("dArr", "⇓") registerEntity("hArr", "⇔") registerEntity("forall", "∀") registerEntity("part", "∂") registerEntity("exist", "∃") registerEntity("empty", "∅") registerEntity("nabla", "∇") registerEntity("isin", "∈") registerEntity("notin", "∉") registerEntity("ni", "∋") registerEntity("prod", "∏") registerEntity("sum", "∑") registerEntity("minus", "−") registerEntity("lowast", "∗") registerEntity("radic", "√") registerEntity("prop", "∝") registerEntity("infin", "∞") registerEntity("ang", "∠") registerEntity("and", "∧") registerEntity("or", "∨") registerEntity("cap", "∩") registerEntity("cup", "∪") registerEntity("int", "∫") registerEntity("there4", "∴") registerEntity("sim", "∼") registerEntity("cong", "≅") registerEntity("asymp", "≈") registerEntity("ne", "≠") registerEntity("equiv", "≡") registerEntity("le", "≤") registerEntity("ge", "≥") registerEntity("sub", "⊂") registerEntity("sup", "⊃") registerEntity("nsub", "⊄") registerEntity("sube", "⊆") registerEntity("supe", "⊇") registerEntity("oplus", "⊕") registerEntity("otimes", "⊗") registerEntity("perp", "⊥") registerEntity("sdot", "⋅") registerEntity("lceil", "⌈") registerEntity("rceil", "⌉") registerEntity("lfloor", "⌊") registerEntity("rfloor", "⌋") registerEntity("lang", "〈") registerEntity("rang", "〉") registerEntity("loz", "◊") registerEntity("spades", "♠") registerEntity("clubs", "♣") registerEntity("hearts", "♥") registerEntity("diams", "♦") ## registerEntity("quot", """) ## registerEntity("amp", "&#38;") ## registerEntity("lt", "&#60;") ## registerEntity("gt", ">") ## registerEntity("apos", "'") registerEntity("OElig", "Œ") registerEntity("oelig", "œ") registerEntity("Scaron", "Š") registerEntity("scaron", "š") registerEntity("Yuml", "Ÿ") registerEntity("circ", "ˆ") registerEntity("tilde", "˜") registerEntity("ensp", " ") registerEntity("emsp", " ") registerEntity("thinsp", " ") registerEntity("zwnj", "‌") registerEntity("zwj", "‍") registerEntity("lrm", "‎") registerEntity("rlm", "‏") registerEntity("ndash", "–") registerEntity("mdash", "—") registerEntity("lsquo", "‘") registerEntity("rsquo", "’") registerEntity("sbquo", "‚") registerEntity("ldquo", "“") registerEntity("rdquo", "”") registerEntity("bdquo", "„") registerEntity("dagger", "†") registerEntity("Dagger", "‡") registerEntity("permil", "‰") registerEntity("lsaquo", "‹") registerEntity("rsaquo", "›") registerEntity("euro", "€") end ## parse token def nextToken token = '' if @xhtmlp elementpat = /[\<\>\[\]\=\/]/u else elementpat = /[\<\>\[\]\=]/u end while !(c = @content[@pos, 1]).nil? if c == '' ## EOF return token if token != '' return nil elsif c == '-' && token == '/u, @pos + 1) raise ParseError.new("comment parse error") unless commentpos @content[commentpos..-1] =~ /--[ \t\n\r]>/u len = $&.length token += @content[@pos, commentpos - @pos + len] @pos = commentpos + len return token elsif c == '-' && token == '-' ## Comment in decl commentpos = @content.index(/--/u, @pos + 1) raise ParseError.new("comment parse error") unless commentpos token += @content[@pos, commentpos - @pos + 2] @pos = commentpos + 2 return token elsif c == '?' && token == '<' ## PI pipos = @content.index("?>", @pos + 1) raise ParseError.new("PI parse error") unless pipos token += @content[@pos, pipos - @pos + 2] @pos = pipos + 2 return token elsif c =~ /[ \t\n\r]/u ## White Space return token if token != '' @pos += 1 next elsif c =~ elementpat ## Element return token if token != '' if c == '=' || c == '>' @pos += 1 return c end @pos += 1 token = c next ## Literal elsif token == '' && (c == '"' || c == "'") quotpos = @content.index(c, @pos + 1) raise ParseError.new("literal parse error") unless quotpos token = @content[@pos, quotpos - @pos + 1] @pos = quotpos + 1 return token ## Others else token += c @pos += 1 next end end nil end def checkNameChar(str) str =~ /\A([^\W0-9]|:)[\w\.\-:]*\Z/u end ## parse DTD def parseDTD(dtd) @pos -= dtd.length start = @pos if (token = nextToken) != '' && token != '[' extid = token token = nextToken end end ## skip internel DTD subset if token == '[' while (token = nextToken) if token == ']' token = nextToken break end end end if token != '>' raise ParseError.new("DOCTYPE parse error") end if !@forceHTML && pubid =~ /^[\"\']-\/\/W3C\/\/DTD XHTML / @xhtmlp = true end # p [doctype, pubid, extid] @content[start + 1, @pos - start - 2] ## chop the first '<' and ## the last '>' end def isEmptyElement(name) return false if @xhtmlp name =~ /^(br|area|link|img|param|hr|input|col|base|meta|basefont|frame|isindex)$/ end def isCdataElement(name) return false if @xhtmlp name =~ /^(style|script)$/ end ## parse Element start tag def parseElementStartTag(elem) empty = nil attrs = {} rawattrs = {} ## rewind @pos -= elem.length start = @pos name = nextToken if !checkNameChar(name) ## rollback @pos = start return nil # raise ParseError.new("illegal element name: #{name.inspect}") end name.downcase! unless @xhtmlp token = nextToken while !token.nil? break if token == '>' if token == '/' ## empty element tag token = nextToken if token != '>' ## rollback @pos = start return nil # raise ParseError.new("element parse error") end empty = 1 break end attrname = token # if !checkNameChar(attrname) # raise ParseError.new("illegal attribute name: #{attrname.inspect}") # end attrname.downcase! unless @xhtmlp token = nextToken if token != '=' raise ParseError.new("attribute parse error") if @xhtmlp attrvalue = attrname if ATTR_NAME[name] for n, v in ATTR_NAME[name] if attrvalue =~ v attrname = n break end end end else attrvalue = nextToken token = nextToken end if attrvalue !~ /\A([\'\"]?)([\w\W]*)\1\Z/u raise ParseError.new("attribute parse error: #{attrvalue.inspect}") end # if attrs.include?(attrname) # raise ParseError.new("dupulicate attribute: #{attrname.inspect}") # end attrs[attrname] = expandAttrValue($2) if @eliminateWhiteSpace attrs[attrname].gsub!(/[ \x9\n]+/, ' ') attrs[attrname].gsub!(/\A +| +\z/, '') end rawattrs[attrname] = attrvalue end empty = 1 if isEmptyElement(name) [name, attrs, empty, rawattrs] end def expect(key, include = 0) token = nil pos = @content.index(key, @pos) if pos.nil? token = @content[@pos..-1] @pos = -1 return token elsif key.is_a?(Regexp) && include > 0 @content[pos..-1] =~ key include = $&.length end token = @content[@pos, pos - @pos + include] @pos = pos + include token end def parseTag(e = nil) c = @content[@pos, 1] if !e.nil? token = expect(Regexp.new(e, 'i', 'u'), 0) return [:CDATA, token] elsif c == '<' ## Markup token = expect(">", 1) if token[-1, 1] != '>' return [:MARKUP, nil] end return [:MARKUP, token] else ## CharData return [:PCDATA, expect("<")] end end def normalizeLineBreak(str) return nil unless str str.gsub(/\x0d\x0a|\x0d/u, "\x0a") end def checkContent(parent, child) return true unless ContentList.include?(child) return true unless ContentList.include?(parent) return true if child =~ ContentList[parent] false end def doPreParseProcessing ## normalize line break @content[@pos..-1] = normalizeLineBreak(@content[@pos..-1]) end public attr_accessor :eliminateWhiteSpace attr_accessor :forceHTML ZenkakuChar = [0x3000, ?-, 0x9fff, 0xf900, ?-, 0xfaff] IgnorableSpaces = /([#{ZenkakuChar.pack('U*')}])\n+([#{ZenkakuChar.pack('U*')}])/u def parse(content, &block) @content = content if !content.is_a?(InputStream) && content.respond_to?('read') @content = InputStream.new(content) end @block = block @pos = 0 estack = [] if @content.nil? return 0 end if @encoding && @content.is_a?(InputStream) @content.setEncoding(@encoding) end lastContent = nil nextContent = nil while @pos >= 0 ttype, part = parseTag(nextContent) oldpart = part nextContent = nil if part.nil? raise ParseError.new("unexpected EOF") elsif ttype == :PCDATA ## #PCDATA if !lastContent doPreParseProcessing part = normalizeLineBreak(part) end lastContent = :PCDATA if estack.length == 0 next if part =~ /\A[ \x9\r\n]*\Z/u raise ParseError.new("cdata must be in document element: #{part.inspect}") end if !havePCDATA?(estack[-1]) next if part =~ /\A[ \x9\r\n]*\Z/u # raise ParseError.new("cannot have #PCDATA in #{estack[-1]}") end part = expandRef(part) if @eliminateWhiteSpace && estack[-1] != 'pre' part.gsub!(IgnorableSpaces, '\1\2') part.gsub!(/[ \x9\n]+/, ' ') part.gsub!(/\A +| +\z/, '') end if part != '' if block_given? @block.call(:CDATA, nil, part) else character(part) end end next elsif ttype == :CDATA lastContent = :CDATA ## CDATA if block_given? @block.call(:CDATA, nil, part) else character(part) end next else first = part[1] if first == ?? && part =~ /\A<\?xml[ \t\n\r\?]/u && lastContent.nil? ## XML Declaration if (part =~ /\A<\?xml([ \t\n\r]+version[ \t\n\r]*=[ \t\n\r]*(['"])([a-zA-Z0-9_.:\-]+)\2)?([ \t\n\r]+encoding[ \t\n\r]*=[ \t\n\r]*(['"])(.*?)\5)?([ \t\n\r]+standalone[ \t\n\r]*=[ \t\n\r]*(['"])(yes|no)\8)?[ \t\n\r]*\?>/u) != 0 raise ParseError.new("illegal XML declaration") end @xhtmlp = true if !@forceHTML version = $3 encoding = $6 standalone = $9 if !version raise ParseError.new("invalid XML declaration") end if version != '1.0' && version != '1.1' raise ParseError.new("version #{version} not supported") end if block_given? @block.call(:XML_DECL, nil, [version, encoding, standalone]) else xmlDecl(version, encoding, standalone) end if encoding && @content.is_a?(InputStream) @content.setEncoding(encoding.downcase) end next end ## pre-parse processing after XML Declaration if !lastContent || lastContent == :XML_DECL doPreParseProcessing part = normalizeLineBreak(part) next if lastContent end if first == ?? ## Processing Instruction lastContent = :PI if part !~ /\?>\Z/u part += expect("?>", 2) if part !~ /\?>\Z/u raise ParseError.new("processing instruction data expected") end end part = part[2..-3] ## strip "" part =~ /\A([^ \t\n\r]+)([ \t\n\r]+(.*))?\Z/mu name = $1 data = $3.to_s if @xhtmlp && name =~ /\Axml\z/i raise ParseError.new("illegal PI name: #{name.inspect}") end ##!!! chack name if block_given? @block.call(:PI, name, data) else processingInstruction(name, data) end next elsif first == ?! if part =~ /\A\Z/u raise ParseError.new("comment must end with \"-->\"") end part =~ /\A