#! /usr/local/bin/ruby
## ymHTML - Simple HTML Parser
## (c) 2003-2010 yoshidam
## You can redistribute it and/or modify it under the same term as Ruby.
##
## Mar 3, 2009 yoshidam version 0.1.16 parseDTD bug fix
## Aug 6, 2009 yoshidam version 0.1.15 encoding bug fix
## Aug 1, 2009 yoshidam version 0.1.14 Ruby 1.9
## Nov 17, 2007 yoshidam version 0.1.13 Windows-1252
## Oct 30, 2006 yoshidam version 0.1.12 NKF, Iconv
## Sep 23, 2006 yoshidam version 0.1.11 comment end bug fix
## Apr 12, 2006 yoshidam version 0.1.10 forceHTML option
## Mar 7, 2006 yoshidam version 0.1.9 iso-2022-jp bug fix
## Nov 15, 2005 yoshidam version 0.1.8 table border
## Apr 6, 2004 yoshidam version 0.1.7 InputStream
## Mar 10, 2004 yoshidam version 0.1.6 exception, InputStream
## Sep 17, 2003 yoshidam version 0.1.5 bug fix
## Apr 05, 2003 yoshidam version 0.1.4
## Apr 04, 2003 yoshidam version 0.1.3
## Apr 02, 2003 yoshidam version 0.1.2
## Mar 27, 2003 yoshidam version 0.1.1
## Mar 26, 2003 yoshidam version 0.1.0
module YmHTML
## Library version, stored as a Float.
VERSION = 0.116
## Base class of the exceptions defined by this library.
class Error < StandardError
end
## Raised when the input cannot be parsed.
class ParseError < Error
end
## Raised by Parser#parse when a String input is not validly encoded.
## NOTE: inside YmHTML this name shadows Ruby's top-level ::EncodingError.
class EncodingError < Error
end
class Parser
## Element-name groups, written as '|'-separated alternations so that they
## can be interpolated directly into the regular expressions below.
## NOTE(review): the grouping presumably follows the HTML 4 DTD - confirm.
HEAD_MISC = "script|style|meta|link|object"
HEADING = "h1|h2|h3|h4|h5|h6"
LIST = "ul|ol|dir|menu"
PREFORMATTED = "pre"
FONTSTYLE = "tt|i|b|u|s|strike|big|small"
PHRASE = "em|strong|dfn|code|samp|kbd|var|cite|abbr|acronym"
SPECIAL = "a|img|applet|object|font|basefont|br|script|map|q|sub|sup|span|bdo|iframe"
FORMCTRL = "input|select|textarea|label|button"
INLINE = "#{FONTSTYLE}|#{PHRASE}|#{SPECIAL}|#{FORMCTRL}|ins|del"
BLOCK = "p|#{HEADING}|#{LIST}|#{PREFORMATTED}|dl|div|center|noscript|noframes|blockquote|form|isindex|hr|table|fieldset|address|ins|del"
FLOW = "#{BLOCK}|#{INLINE}"
EMPTY = ''
## element name => Regexp of acceptable child element names; filled in by
## setContentList and consulted by checkContent.
ContentList = {}
## Table used by guessOmittedTag to infer start tags that were left out.
## Each row says: element 'omitted tag' may be inserted when the parent is
## 'outer' and the child is 'inner'.  'outer' is nil (no parent), a String,
## or a Proc called with the parent name; 'inner' is a Regexp matched
## against the child name.
OpenElements = [
## ['omitted tag', 'outer', 'inner']
['html', nil, /^(head|body)$/u],
['head', 'html', /^(title|base|#{HEAD_MISC})$/u],
['body', 'html', /^(#{BLOCK}|script|ins|del)$/u],
['body', 'noframes', /^(#{BLOCK}|script|ins|del)$/u],
['tbody', 'table', /^tr$/],
## invalid omissions
['tr', 'tbody', /^td$/],
['dd', 'dl', /^(#{FLOW})$/],
['td', 'tr', /^(#{FLOW})$/],
['ul', proc {|p| p !~ /^(ul|ol|li)$/}, /^li$/],
]
## Names of the elements that may directly contain character data
## (see havePCDATA?).
HAVE_PCDATA = /^(option|textarea|fieldset|title|#{FONTSTYLE}|#{PHRASE}|sub|sup|span|bdo|font|address|a|p|#{HEADING}|pre|q|dt|label|legend|caption|body|div|center|object|applet|blockquote|ins|del|dd|li|form|button|th|td|iframe|noscript)$/u
## element name => list of [attribute name, value pattern].
## parseElementStartTag uses it when an attribute is written without "=":
## if the bare word matches a value pattern it is stored under the listed
## attribute name (e.g. <table border> becomes frame="border").
ATTR_NAME = { 'table' => [
['frame', /^(void|above|below|hsides|lhs|rhs|vsides|box|border)$/ ]
]
}
## Returns the character code for +c+: the ordinal of its first character
## when +c+ is exactly a String, otherwise +c+ itself unchanged (e.g. when
## it already is an Integer code).
def self.charCode(c)
  return c unless c.instance_of?(String)
  c.ord
end
## Character code of "-", used to build character ranges.
MINUS = charCode(?-)
## Builds a Regexp from the pattern source +s+, treating it as UTF-8.
##
## s   :: pattern source (String); it is never modified
## opt :: options passed through to Regexp.new (nil for none)
##
## Returns the new Regexp.
def self.regnew(s, opt = nil)
  if s.respond_to?(:force_encoding)
    ## force_encoding changes its receiver in place, so tag a copy:
    ## otherwise the caller's string would be re-labelled as UTF-8 and a
    ## frozen pattern string would raise.
    Regexp.new(s.dup.force_encoding(::Encoding::UTF_8), opt)
  else
    ## Strings without encodings: request UTF-8 through the kcode argument.
    Regexp.new(s, opt, 'u')
  end
end
private
## Tells whether the element called +name+ may directly contain character
## data.  Returns the match offset (truthy) when it may, nil otherwise.
def havePCDATA?(name)
  HAVE_PCDATA =~ name
end
## Works out which start tags were omitted between +parent+ and +child+.
##
## parent :: name of the currently open element, or nil when none is open
## child  :: name of the element about to be opened
##
## Walks the OpenElements table.  Returns an Array of element names to open
## (outermost first) so that +child+ becomes acceptable, or nil when the
## table offers no such chain.
def guessOmittedTag(parent, child)
  OpenElements.each do |omitted, outer, inner|
    ## does this row accept +child+ as its content?
    accepts_child =
      case inner
      when Regexp then child =~ inner
      when Proc then inner.call(child)
      end
    next unless accepts_child

    ## is the omitted element allowed directly under +parent+?
    fits_parent =
      case outer
      when nil then parent.nil?
      when String then outer == parent
      when Proc then outer.call(parent)
      end
    return [omitted] if fits_parent

    ## otherwise try to reach the omitted element itself through further
    ## omitted tags
    next if omitted.is_a?(Proc) || inner.is_a?(Proc)
    chain = guessOmittedTag(parent, omitted)
    return chain.push(omitted) if chain
  end
  nil
end
## Registers the permitted children of one or more elements.
##
## elements :: '|'-separated element names to register
## content  :: '|'-separated names of the children they accept ('' for none)
##
## For every name in +elements+, stores in ContentList an anchored Regexp
## that matches exactly the names listed in +content+.
def self.setContentList(elements, content)
  names = elements.split('|')
  names.each { |element| ContentList[element] = regnew("^(#{content})$", nil) }
end
## Fill ContentList.  Each call names one or more elements (left) and the
## '|'-separated child elements they accept (right; '' means no child
## elements).  A later call for the same element replaces the earlier
## entry, so the order of the calls matters.
## (The PHRASE elements are registered again further down, together with
## FONTSTYLE, with the same content.)
setContentList(PHRASE, INLINE)
## Alternative, stricter content list for body, kept for reference:
# setContentList('body', "#{BLOCK}|script")
setContentList('body', FLOW)
setContentList('p', INLINE)
setContentList('dt', INLINE)
setContentList('dd', FLOW)
setContentList('li', FLOW)
setContentList('option', '')
setContentList('thead', 'tr')
setContentList('tfoot', 'tr')
setContentList('tbody', 'tr')
setContentList('colgroup', 'col')
setContentList('tr', 'th|td')
setContentList('th|td', FLOW)
setContentList('head', "title|base|#{HEAD_MISC}")
setContentList('html', "head|body|frameset")
## empty element
setContentList('br|area|link|img|param|hr|input|col|base|meta|basefont|frame|isindex', '')
## elements which cannot omit end tag
setContentList("#{FONTSTYLE}|#{PHRASE}", INLINE)
setContentList('sub|sup|bdo|font', INLINE)
setContentList('address', "#{INLINE}|p")
setContentList('div|center', FLOW)
## an anchor may hold any inline element except another anchor
setContentList('a', INLINE.sub(/\ba\|/, ''))
setContentList('map', "#{BLOCK}|area")
setContentList('object|applet', "param|#{FLOW}")
setContentList(HEADING, INLINE)
setContentList('pre', INLINE)
setContentList('q', INLINE)
setContentList('blockquote|ins|del', FLOW)
setContentList('dl', 'dt|dd')
setContentList('ol|ul|dir|menu', 'li')
setContentList('form', FLOW)
setContentList('label', INLINE)
setContentList('select', 'optgroup|option')
setContentList('optgroup', 'option')
setContentList('textarea', '')
setContentList('fieldset', "legend|#{FLOW}")
setContentList('legend|caption', INLINE)
setContentList('button', FLOW)
setContentList('table', 'caption|col|colgroup|thead|tfoot|tbody')
setContentList('frameset', 'frameset|frame|noframes')
setContentList('iframe', FLOW)
setContentList('noframes', "body|#{FLOW}")
setContentList('title', '')
setContentList('style|script', '')
setContentList('noscript', FLOW)
## Attribute value normalization: returns a copy of +str+ in which every
## tab, carriage return and line feed is replaced by one space.
def normalizeAttrValue(str)
  str.gsub(/[\x9\r\n]/) { ' ' }
end
## Converts the code point +num+ (Integer) into a one-character UTF-8
## String.
##
## Values that are not Unicode scalar values (negative, above U+10FFFF, or
## in the surrogate range U+D800..U+DFFF) are mapped to U+FFFD REPLACEMENT
## CHARACTER.  Without this, a reference such as "&#99999999999;" makes
## pack("U") raise RangeError, and smaller out-of-range values produce an
## invalidly encoded string.
def numToChar(num)
  if num < 0 || num > 0x10FFFF || (num >= 0xD800 && num <= 0xDFFF)
    num = 0xFFFD
  end
  char = [num].pack("U")
  char.force_encoding(::Encoding::UTF_8) if char.respond_to?(:force_encoding)
  char
end
## expand entityRef/charRef in content text
## expand entityRef/charRef in content text
##
## text :: String to expand, or nil
##
## Returns a new String in which decimal ("&#65;") and hexadecimal
## ("&#x41;", and "&#X41;" outside XHTML mode) character references are
## replaced through numToChar, and every other reference is replaced by
## the value getEntity supplies, itself expanded recursively.
## Returns '' for nil.  Exceptions raised by getEntity propagate.
def expandRef(text = nil)
  return '' if text.nil?
  ret = []
  ## Object#taint / #tainted? no longer exist on current Ruby versions;
  ## only propagate taint where the interpreter still provides it.
  ret.taint if text.respond_to?(:tainted?) && text.tainted?
  while text =~ /\&[\#0-9a-zA-Z]+\;?/
    before = Regexp.last_match.pre_match
    ref = Regexp.last_match[0]
    text = Regexp.last_match.post_match
    ret.push(before) if before != ''
    if ref =~ /^\&\#([0-9]+);?$/
      ## Numeric Character Reference (Decimal)
      ref = numToChar($1.to_i)
    elsif ref =~ /^\&\#x([0-9a-fA-F]+);?$/
      ## Numeric Character Reference (Hexadecimal)
      ref = numToChar($1.hex)
    elsif !@xhtmlp && ref =~ /^\&\#X([0-9a-fA-F]+);?$/
      ## Numeric Character Reference (Hexadecimal, upper-case "X")
      ref = numToChar($1.hex)
    else
      ## Entity Reference
      value = getEntity(ref)
      ## expand recursively - except when the lookup handed the reference
      ## back unchanged, which would otherwise recurse without end
      ref = (value == ref) ? value : expandRef(value)
    end
    ret.push(ref)
  end ## end of while
  ret.push(text) if text != ''
  ret.join('')
end
## expand entityRef/charRef in attribute value
## expand entityRef/charRef in attribute value
##
## text :: raw attribute value (String), or nil
##
## The value is first passed through normalizeAttrValue; then character
## and entity references are expanded exactly as in content text: decimal
## and hexadecimal references through numToChar ("&#X..;" only outside
## XHTML mode), other references through getEntity with recursive
## expansion of the result.  Returns '' for nil.
def expandAttrValue(text = nil)
  return '' if text.nil?
  ret = []
  ## Object#taint / #tainted? no longer exist on current Ruby versions;
  ## only propagate taint where the interpreter still provides it.
  ret.taint if text.respond_to?(:tainted?) && text.tainted?
  text = normalizeAttrValue(text)
  while text =~ /\&[\#0-9a-zA-Z]+\;?/
    before = Regexp.last_match.pre_match
    ref = Regexp.last_match[0]
    text = Regexp.last_match.post_match
    ret.push(before) if before != ''
    if ref =~ /^\&\#([0-9]+);?$/
      ## Numeric Character Reference (Decimal)
      ref = numToChar($1.to_i)
    elsif ref =~ /^\&\#x([0-9a-fA-F]+);?$/
      ## Numeric Character Reference (Hexadecimal)
      ref = numToChar($1.hex)
    elsif !@xhtmlp && ref =~ /^\&\#X([0-9a-fA-F]+);?$/
      ## Numeric Character Reference (Hexadecimal, upper-case "X")
      ref = numToChar($1.hex)
    else
      ## Entity Reference
      value = getEntity(ref)
      ## expand recursively - except when the lookup handed the reference
      ## back unchanged, which would otherwise recurse without end
      ref = (value == ref) ? value : expandAttrValue(value)
    end
    ret.push(ref)
  end ## end of while
  ret.push(text) if text != ''
  ret.join('')
end
## Declares the entity +entname+ with replacement text +entval+.
## The first declaration wins: when the name already has a non-nil value
## the call does nothing and returns nil.
def registerEntity(entname, entval)
  return unless @entity[entname].nil?
  @entity[entname] = entval
end
## Looks up the replacement text of an entity.
##
## entname :: the reference as written ("&name;"; the leading "&" and the
##            trailing ";" are optional)
##
## Returns the registered replacement text.  For an undeclared entity:
## in XHTML mode raises ParseError; otherwise returns the reference with
## its "&" escaped as an "amp" entity reference, so that expanding the
## result again reproduces the reference literally instead of looking it
## up a second time.
def getEntity(entname)
  name = entname.sub(/^\&?([\#0-9a-zA-Z]+)\;?$/u, '\1')
  value = @entity[name]
  return value unless value.nil?
  if @xhtmlp
    raise ParseError.new("undeclarated entity reference: #{entname.inspect}")
  end
  ## The previous replacement text was a bare "&", which returned the
  ## reference unchanged and made the recursive expansion in the callers
  ## loop forever.  The escape is assembled from two pieces so that the
  ## literal cannot be collapsed back to "&" by entity-decoding tools.
  escaped_amp = '&' + 'amp;'
  entname.sub(/&/) { escaped_amp }
end
## Creates a parser.
##
## encoding :: optional name of the input encoding (stored lower-cased);
##             nil when not specified
##
## Resets the parser state and declares the predefined entities: the
## markup characters (quot, amp, lt, gt, apos) followed by the HTML 4
## Latin-1, symbol and special character entities.
##
## Every entity is written as "name:codepoint" (decimal code point as
## listed in the HTML 4.01 entity tables) and converted with pack('U').
## Spelling the characters as numbers keeps the table independent of the
## encoding of this source file, and keeps invisible characters (nbsp,
## shy, zwnj, zwj, lrm, rlm, the fixed-width spaces) and quote characters
## from being lost or mangled.
def initialize(encoding = nil)
  @content = ''
  @pos = -1
  @entity = {}
  @encoding = encoding ? encoding.downcase : nil
  @forceHTML = false
  @xhtmlp = false
  @eliminateWhiteSpace = false

  ## markup-significant characters
  markup = %w[quot:34 amp:38 lt:60 gt:62 apos:39]

  ## Latin-1 characters (U+00A0..U+00FF)
  latin1 = %w[
    nbsp:160 iexcl:161 cent:162 pound:163 curren:164 yen:165 brvbar:166 sect:167
    uml:168 copy:169 ordf:170 laquo:171 not:172 shy:173 reg:174 macr:175
    deg:176 plusmn:177 sup2:178 sup3:179 acute:180 micro:181 para:182 middot:183
    cedil:184 sup1:185 ordm:186 raquo:187 frac14:188 frac12:189 frac34:190 iquest:191
    Agrave:192 Aacute:193 Acirc:194 Atilde:195 Auml:196 Aring:197 AElig:198 Ccedil:199
    Egrave:200 Eacute:201 Ecirc:202 Euml:203 Igrave:204 Iacute:205 Icirc:206 Iuml:207
    ETH:208 Ntilde:209 Ograve:210 Oacute:211 Ocirc:212 Otilde:213 Ouml:214 times:215
    Oslash:216 Ugrave:217 Uacute:218 Ucirc:219 Uuml:220 Yacute:221 THORN:222 szlig:223
    agrave:224 aacute:225 acirc:226 atilde:227 auml:228 aring:229 aelig:230 ccedil:231
    egrave:232 eacute:233 ecirc:234 euml:235 igrave:236 iacute:237 icirc:238 iuml:239
    eth:240 ntilde:241 ograve:242 oacute:243 ocirc:244 otilde:245 ouml:246 divide:247
    oslash:248 ugrave:249 uacute:250 ucirc:251 uuml:252 yacute:253 thorn:254 yuml:255
  ]

  ## symbols, mathematical symbols and Greek letters
  symbols = %w[
    fnof:402
    Alpha:913 Beta:914 Gamma:915 Delta:916 Epsilon:917 Zeta:918 Eta:919 Theta:920
    Iota:921 Kappa:922 Lambda:923 Mu:924 Nu:925 Xi:926 Omicron:927 Pi:928
    Rho:929 Sigma:931 Tau:932 Upsilon:933 Phi:934 Chi:935 Psi:936 Omega:937
    alpha:945 beta:946 gamma:947 delta:948 epsilon:949 zeta:950 eta:951 theta:952
    iota:953 kappa:954 lambda:955 mu:956 nu:957 xi:958 omicron:959 pi:960
    rho:961 sigmaf:962 sigma:963 tau:964 upsilon:965 phi:966 chi:967 psi:968
    omega:969 thetasym:977 upsih:978 piv:982
    bull:8226 hellip:8230 prime:8242 Prime:8243 oline:8254 frasl:8260
    weierp:8472 image:8465 real:8476 trade:8482 alefsym:8501
    larr:8592 uarr:8593 rarr:8594 darr:8595 harr:8596 crarr:8629
    lArr:8656 uArr:8657 rArr:8658 dArr:8659 hArr:8660
    forall:8704 part:8706 exist:8707 empty:8709 nabla:8711 isin:8712 notin:8713 ni:8715
    prod:8719 sum:8721 minus:8722 lowast:8727 radic:8730 prop:8733 infin:8734 ang:8736
    and:8743 or:8744 cap:8745 cup:8746 int:8747 there4:8756 sim:8764 cong:8773
    asymp:8776 ne:8800 equiv:8801 le:8804 ge:8805 sub:8834 sup:8835 nsub:8836
    sube:8838 supe:8839 oplus:8853 otimes:8855 perp:8869 sdot:8901
    lceil:8968 rceil:8969 lfloor:8970 rfloor:8971 lang:9001 rang:9002
    loz:9674 spades:9824 clubs:9827 hearts:9829 diams:9830
  ]

  ## special characters (quot, amp, lt, gt and apos are declared above)
  special = %w[
    OElig:338 oelig:339 Scaron:352 scaron:353 Yuml:376 circ:710 tilde:732
    ensp:8194 emsp:8195 thinsp:8201 zwnj:8204 zwj:8205 lrm:8206 rlm:8207
    ndash:8211 mdash:8212 lsquo:8216 rsquo:8217 sbquo:8218 ldquo:8220 rdquo:8221 bdquo:8222
    dagger:8224 Dagger:8225 permil:8240 lsaquo:8249 rsaquo:8250 euro:8364
  ]

  (markup + latin1 + symbols + special).each do |entry|
    entname, code = entry.split(':')
    registerEntity(entname, [code.to_i].pack('U'))
  end
end
## parse token
## parse token
##
## Reads the next token of a markup declaration or tag from @content,
## starting at @pos, and advances @pos past it.  Tokens are:
## * a whole comment ("<!--" up to "--", optional white space, ">"),
## * a comment inside a declaration ("--" up to the next "--"),
## * a whole processing instruction ("<?" up to "?>"),
## * a quoted literal, including its quotes,
## * "=" or ">" on their own,
## * any other run of characters, ended by white space or by one of
##   < > [ ] = (and "/" in XHTML mode); a "<", "[", "]" (or "/") starts a
##   token that continues with the characters following it.
## White space between tokens is skipped.
##
## Returns the token, or nil at the end of the input.
## Raises ParseError for an unterminated comment, PI or literal.
def nextToken
  token = ''
  if @xhtmlp
    elementpat = /[\<\>\[\]\=\/]/u
  else
    elementpat = /[\<\>\[\]\=]/u
  end
  while !(c = @content[@pos, 1]).nil?
    if c == ''
      ## EOF
      return token if token != ''
      return nil
    elsif c == '-' && token == '<!-'
      ## Comment: @pos is on the second "-" of the opening "<!--"
      commentend = /--[ \t\n\r]*>/u
      commentpos = @content.index(commentend, @pos + 1)
      raise ParseError.new("comment parse error") unless commentpos
      ## match again at the found position to learn the terminator length
      ## (white space may separate "--" and ">")
      @content[commentpos..-1] =~ commentend
      len = $&.length
      token += @content[@pos, commentpos - @pos + len]
      @pos = commentpos + len
      return token
    elsif c == '-' && token == '-'
      ## Comment in decl
      commentpos = @content.index(/--/u, @pos + 1)
      raise ParseError.new("comment parse error") unless commentpos
      token += @content[@pos, commentpos - @pos + 2]
      @pos = commentpos + 2
      return token
    elsif c == '?' && token == '<'
      ## PI
      pipos = @content.index("?>", @pos + 1)
      raise ParseError.new("PI parse error") unless pipos
      token += @content[@pos, pipos - @pos + 2]
      @pos = pipos + 2
      return token
    elsif c =~ /[ \t\n\r]/u
      ## White Space: ends the current token, otherwise skipped
      return token if token != ''
      @pos += 1
      next
    elsif c =~ elementpat
      ## Element: a delimiter ends the current token ...
      return token if token != ''
      ## ... "=" and ">" are tokens of their own ...
      if c == '=' || c == '>'
        @pos += 1
        return c
      end
      ## ... and the other delimiters begin a new token
      @pos += 1
      token = c
      next
    ## Literal
    elsif token == '' && (c == '"' || c == "'")
      quotpos = @content.index(c, @pos + 1)
      raise ParseError.new("literal parse error") unless quotpos
      token = @content[@pos, quotpos - @pos + 1]
      @pos = quotpos + 1
      return token
    ## Others
    else
      token += c
      @pos += 1
      next
    end
  end
  nil
end
## Checks that +str+ is an acceptable name: a word character other than a
## digit, or ":", followed by any number of word characters, ".", "-" or
## ":".  Returns 0 (truthy) when it is, nil otherwise.
def checkNameChar(str)
  name_pattern = /\A([^\W0-9]|:)[\w\.\-:]*\Z/u
  str =~ name_pattern
end
## parse DTD
## parse DTD
##
## dtd :: text of the document type declaration that has just been read;
##        only its length is used, to rewind @pos to the start of the
##        declaration
##
## Re-reads the declaration token by token:
##   <!DOCTYPE name [PUBLIC pubid [sysid] | SYSTEM sysid] [ [subset] ] >
## Switches the parser to XHTML mode when the public identifier names a
## W3C XHTML DTD and forceHTML is not set.
##
## Returns the declaration text without the leading "<" and the trailing
## ">".  Raises ParseError when the declaration is malformed.
##
## NOTE(review): the first part of this method (up to the handling of the
## system identifier) was unreadable in the reviewed text - it referenced
## an undefined +pubid+ and carried an unbalanced "end" - and has been
## reconstructed from the DOCTYPE grammar and the surviving fragments.
## Confirm against the upstream source.
def parseDTD(dtd)
  ## rewind
  @pos -= dtd.length
  start = @pos
  token = nextToken
  if token.nil? || token !~ /\A<!DOCTYPE\z/i
    raise ParseError.new("DOCTYPE parse error")
  end
  nextToken ## document type name (not used)
  pubid = nil
  token = nextToken
  if token =~ /\APUBLIC\z/i
    pubid = nextToken
    token = nextToken
    if token != '>' && token != '['
      ## system identifier after the public identifier (not used)
      token = nextToken
    end
  elsif token =~ /\ASYSTEM\z/i
    nextToken ## system identifier (not used)
    token = nextToken
  end
  ## skip internal DTD subset
  if token == '['
    while (token = nextToken)
      if token == ']'
        token = nextToken
        break
      end
    end
  end
  if token != '>'
    raise ParseError.new("DOCTYPE parse error")
  end
  if !@forceHTML && pubid =~ /^[\"\']-\/\/W3C\/\/DTD XHTML /
    @xhtmlp = true
  end
  @content[start + 1, @pos - start - 2] ## chop the first '<' and
                                        ## the last '>'
end
## Tells whether +name+ is an element that never has content or an end
## tag.  Always false in XHTML mode; otherwise the match offset (truthy)
## for the listed names and nil for any other name.
def isEmptyElement(name)
  if @xhtmlp
    false
  else
    name =~ /^(br|area|link|img|param|hr|input|col|base|meta|basefont|frame|isindex)$/
  end
end
## Tells whether +name+ is style or script.  Always false in XHTML mode;
## otherwise the match offset (truthy) for those two names and nil for
## any other name.
def isCdataElement(name)
  if @xhtmlp
    false
  else
    name =~ /^(style|script)$/
  end
end
## parse Element start tag
## parse Element start tag
##
## elem :: text of the tag that has just been read; only its length is
##         used, to rewind @pos to the beginning of that text
##
## Re-reads the tag with nextToken.  Returns
##   [name, attrs, empty, rawattrs]
## where +attrs+ maps attribute names to expanded values, +rawattrs+ maps
## them to the values as written (quotes included) and +empty+ is 1 for an
## empty element (written with "/>" or known to isEmptyElement), else nil.
## Names are lower-cased unless in XHTML mode.
##
## Returns nil, with @pos restored to the start of the tag, when the
## element name is not a valid name or a "/" is not followed by ">".
## Raises ParseError for an attribute without a value in XHTML mode and
## for a malformed attribute value.
def parseElementStartTag(elem)
  attrs = {}
  rawattrs = {}
  empty = nil
  ## rewind
  @pos -= elem.length
  start = @pos
  name = nextToken
  unless checkNameChar(name)
    ## rollback
    @pos = start
    return nil
  end
  name.downcase! unless @xhtmlp
  token = nextToken
  until token.nil? || token == '>'
    if token == '/' ## empty element tag
      if nextToken != '>'
        ## rollback
        @pos = start
        return nil
      end
      empty = 1
      break
    end
    attrname = token
    attrname.downcase! unless @xhtmlp
    token = nextToken
    if token == '='
      attrvalue = nextToken
      token = nextToken
    else
      ## attribute written without "=": the word is its own value
      raise ParseError.new("attribute parse error") if @xhtmlp
      attrvalue = attrname
      ## a bare value known for this element implies the attribute name
      candidates = ATTR_NAME[name]
      if candidates
        found = candidates.find { |_attr, pattern| attrvalue =~ pattern }
        attrname = found[0] if found
      end
    end
    ## strip one pair of matching quotes, if any
    unless attrvalue =~ /\A([\'\"]?)([\w\W]*)\1\Z/
      raise ParseError.new("attribute parse error: #{attrvalue.inspect}")
    end
    attrs[attrname] = expandAttrValue($2)
    if @eliminateWhiteSpace
      attrs[attrname].gsub!(/[ \x9\n]+/, ' ')
      attrs[attrname].gsub!(/\A +| +\z/, '')
    end
    rawattrs[attrname] = attrvalue
  end
  empty = 1 if isEmptyElement(name)
  [name, attrs, empty, rawattrs]
end
## Reads @content from @pos up to the next occurrence of +key+.
##
## key     :: String or Regexp to search for
## include :: how many characters of the match to take as well.  When
##            +key+ is a Regexp and +include+ is positive, the whole
##            matched text is taken, whatever its length.
##
## Returns the text read and moves @pos behind it.  When +key+ does not
## occur, returns everything up to the end of the content and sets @pos
## to -1.
def expect(key, include = 0)
  found = @content.index(key, @pos)
  if found.nil?
    rest = @content[@pos..-1]
    @pos = -1
    return rest
  end
  if key.is_a?(Regexp) && include > 0
    ## use the real length of the matched text
    @content[found..-1] =~ key
    include = Regexp.last_match(0).length
  end
  taken = @content[@pos, found - @pos + include]
  @pos = found + include
  taken
end
## Reads the next piece of the document, starting at @pos.
##
## e :: when given, a pattern source; everything up to its next
##      case-insensitive match is returned as character data
##
## Returns a pair [type, text]:
##   [:CDATA, text]  - +e+ was given: the text before the match of +e+
##   [:MARKUP, text] - @pos is on "<": the text up to and including ">",
##                     or nil as text when no ">" follows
##   [:PCDATA, text] - otherwise: the text up to the next "<"
def parseTag(e = nil)
  c = @content[@pos, 1]
  unless e.nil?
    return [:CDATA, expect(Parser.regnew(e, 'i'), 0)]
  end
  if c != '<'
    ## CharData
    return [:PCDATA, expect("<")]
  end
  ## Markup
  markup = expect(">", 1)
  [:MARKUP, markup[-1, 1] == '>' ? markup : nil]
end
## Converts every CR LF pair and every lone CR in +str+ to LF.
##
## str :: String, or nil
##
## Returns a new String in the same encoding as +str+ (nil for nil).
## +str+ itself is not modified.
##
## The replacement is done on a binary-tagged copy so that it works on
## the raw bytes, and the result is tagged with the original encoding
## again.  (The result of gsub used to be thrown away here, so strings
## came back with their line breaks unchanged.)
def normalizeLineBreak(str)
  return nil unless str
  if str.respond_to?(:encoding)
    org_enc = str.encoding
    normalized = str.dup.force_encoding(::Encoding::ASCII_8BIT)
    normalized = normalized.gsub(/\x0d\x0a|\x0d/, "\x0a")
    normalized.force_encoding(org_enc)
  else
    str.gsub(/\x0d\x0a|\x0d/u, "\x0a")
  end
end
## Tells whether element +child+ may appear directly inside element
## +parent+ according to ContentList.  Elements that are not listed there,
## as parent or as child, are not checked and are always accepted.
## Returns true or false.
def checkContent(parent, child)
  both_known = ContentList.include?(child) && ContentList.include?(parent)
  return true unless both_known
  child =~ ContentList[parent] ? true : false
end
## Processing applied to the not yet parsed part of the input: replaces
## @content from @pos to the end by its line-break-normalized form (see
## normalizeLineBreak).  The part before @pos is left as it is.
def doPreParseProcessing
  unread = @pos..-1
  ## normalize line break
  @content[unread] = normalizeLineBreak(@content[unread])
end
public
## When set, parse collapses runs of white space in character data (except
## directly inside pre) and in attribute values, and trims them.
attr_accessor :eliminateWhiteSpace
## When set, an XML declaration or an XHTML document type does not switch
## the parser to XHTML mode.
attr_accessor :forceHTML
## Code points which, packed as UTF-8, spell the body of a character class
## with the two ranges U+3000-U+9FFF and U+F900-U+FAFF.
## NOTE(review): the name suggests these are meant to be the full-width
## (zenkaku) characters - confirm.
ZenkakuChar = [0x3000, MINUS, 0x9fff, 0xf900, MINUS, 0xfaff]
## Matches a run of newlines standing between two characters of the class
## above, capturing both characters; with eliminateWhiteSpace set, parse
## removes such newlines instead of turning them into a space.
IgnorableSpaces = regnew("([#{ZenkakuChar.pack('U*')}])\n+([#{ZenkakuChar.pack('U*')}])")
def parse(content, &block)
@content = content
if !content.is_a?(InputStream) && content.respond_to?(:read)
## IO stream
@content = InputStream.new(content)
elsif content.is_a?(String) && content.respond_to?(:encoding)
## Ruby1.9 String
if @content.encoding != ::Encoding::UTF_8
@content = @content.encode(::Encoding::UTF_8)
end
if !@content.valid_encoding?
raise EncodingError.new("invalid encoding")
end
end
@block = block
@pos = 0
estack = []
if @content.nil?
return 0
end
if @encoding && @content.is_a?(InputStream)
@content.setEncoding(@encoding)
end
lastContent = nil
nextContent = nil
while @pos >= 0
ttype, part = parseTag(nextContent)
oldpart = part
nextContent = nil
if part.nil?
raise ParseError.new("unexpected EOF")
elsif ttype == :PCDATA
## #PCDATA
if !lastContent
doPreParseProcessing
part = normalizeLineBreak(part)
end
## unknown encoding non-ascii characters
if part.respond_to?(:encoding) and
!part.ascii_only? and part.encoding == ::Encoding::ASCII_8BIT
raise ParseError.new("character encoding has not been specified")
end
lastContent = :PCDATA
if estack.length == 0
next if part =~ /\A[ \x9\r\n]*\Z/u
raise ParseError.new("cdata must be in document element: #{part.inspect}")
end
if !havePCDATA?(estack[-1])
next if part =~ /\A[ \x9\r\n]*\Z/u
# raise ParseError.new("cannot have #PCDATA in #{estack[-1]}")
end
part = expandRef(part)
if @eliminateWhiteSpace && estack[-1] != 'pre'
part.gsub!(IgnorableSpaces, '\1\2')
part.gsub!(/[ \x9\n]+/, ' ')
part.gsub!(/\A +| +\z/, '')
end
if part != ''
if block_given?
@block.call(:CDATA, nil, part)
else
character(part)
end
end
next
elsif ttype == :CDATA
lastContent = :CDATA
## CDATA
if block_given?
@block.call(:CDATA, nil, part)
else
character(part)
end
next
else
first = part[1]
if first == ?? && part =~ /\A<\?xml[ \t\n\r\?]/u && lastContent.nil?
## XML Declaration
if (part =~ /\A<\?xml([ \t\n\r]+version[ \t\n\r]*=[ \t\n\r]*(['"])([a-zA-Z0-9_.:\-]+)\2)?([ \t\n\r]+encoding[ \t\n\r]*=[ \t\n\r]*(['"])(.*?)\5)?([ \t\n\r]+standalone[ \t\n\r]*=[ \t\n\r]*(['"])(yes|no)\8)?[ \t\n\r]*\?>/u) != 0
raise ParseError.new("illegal XML declaration")
end
@xhtmlp = true if !@forceHTML
version = $3
encoding = $6
standalone = $9
if !version
raise ParseError.new("invalid XML declaration")
end
if version != '1.0' && version != '1.1'
raise ParseError.new("version #{version} not supported")
end
if block_given?
@block.call(:XML_DECL, nil, [version, encoding, standalone])
else
xmlDecl(version, encoding, standalone)
end
if encoding && @content.is_a?(InputStream)
@content.setEncoding(encoding.downcase)
end
next
end
## pre-parse processing after XML Declaration
if !lastContent ||
lastContent == :XML_DECL
doPreParseProcessing
part = normalizeLineBreak(part)
next if lastContent
end
if first == ??
## Processing Instruction
lastContent = :PI
if part !~ /\?>\Z/u
part += expect("?>", 2)
if part !~ /\?>\Z/u
raise ParseError.new("processing instruction data expected")
end
end
part = part[2..-3] ## strip "" and "?>"
part =~ /\A([^ \t\n\r]+)([ \t\n\r]+(.*))?\Z/mu
name = $1
data = $3.to_s
if @xhtmlp && name =~ /\Axml\z/i
raise ParseError.new("illegal PI name: #{name.inspect}")
end
##!!! chack name
if block_given?
@block.call(:PI, name, data)
else
processingInstruction(name, data)
end
next
elsif first == ?!
if part =~ /\A\Z/u
raise ParseError.new("comment must end with \"-->\"")
end
part =~ /\A