#! /usr/local/bin/ruby -Ku ## ymXML - Simple XML Parser ## (c) 1998-2007 yoshidam ## You can redistribute it and/or modify it under the same term as Ruby. ## ## Nov 17, 2007 yoshidam version 0.5.3 Windows-1252 ## Oct 30, 2006 yoshidam version 0.5.2 NKF, Iconv ## Apr 23, 2004 yoshidam version 0.5.1 parameter entity ## Apr 14, 2004 yoshidam version 0.5.0 parameter entity ## Apr 6, 2004 yoshidam version 0.4.9 attribute value normalization ## Apr 2, 2004 yoshidam version 0.4.8 recursive entity reference ## Mar 31, 2004 yoshidam version 0.4.7 YmXML::InputStream fix ## Mar 29, 2004 yoshidam version 0.4.6 YmXML::Listener, bug fix ## Mar 19, 2004 yoshidam version 0.4.4 WF check, bug fix ## Mar 18, 2004 yoshidam version 0.4.3 namespace check, bug fix ## Feb 28, 2004 yoshidam version 0.4.2 external entity URI ## Feb 9, 2004 yoshidam version 0.4.1 bug fix ## Feb 5, 2004 yoshidam version 0.4.0 partial support of XML 1.1 ## Jan 4, 2004 yoshidam version 0.3.7 UTF-8 with BOM ## Jan 4, 2004 yoshidam version 0.3.6 exception, UTF-16LE/BE, UTF-32LE/BE ## Sep 17, 2003 yoshidam version 0.3.5 character class, surrogate pair ## Jul 25, 2003 yoshidam version 0.3.4 bug fix ## Jul 9, 2003 yoshidam version 0.3.3 externalEntityRef, entityDecl, ## and notationDecl ## Jun 23, 2003 yoshidam version 0.3.2 createPartialParser, parseRef ## Jun 13, 2003 yoshidam version 0.3.1 attlistDecl, elementDecl ## Mar 25, 2003 yoshidam version 0.3.0 character encodings , namespaces ## and attribute defaults ## Mar 14, 2003 yoshidam version 0.2.1 bug fix ## Jan 7, 1999 yoshidam version 0.2 rewriten with Ruby ## Apr 10, 1998 yoshidam version 0.1 writen with Perl module YmXML VERSION = 0.503 class Error < StandardError end class ParseError < Error end class ParseStopped < ParseError end class EncodingError < Error end module CharClass def self.ary2cc(ary) ret = '' ary.each do |s| case s when Integer ret << Regexp.quote([s].pack('U'), 'u') when Range f = s.first t = s.last case f when Integer ret << Regexp.quote([f].pack('U'), 'u') + '-' + Regexp.quote([t].pack('U'), 'u') else raise ParseError.new("not char: #{s.inspect}") end else raise ParseError.new("not char: #{s.inspect}") end end ret end BaseChar = [ 0x0041..0x005A, 0x0061..0x007A, 0x00C0..0x00D6, 0x00D8..0x00F6, 0x00F8..0x00FF, 0x0100..0x0131, 0x0134..0x013E, # 0x00D8..0x00F6, 0x00F8..0x00FD, 0x00FE..0x00FF, 0x0100..0x0131, 0x0134..0x013E, 0x0141..0x0148, 0x014A..0x017E, 0x0180..0x01C3, 0x01CD..0x01F0, 0x01F4..0x01F5, 0x01FA..0x0217, 0x0250..0x02A8, 0x02BB..0x02C1, 0x0386, 0x0388..0x038A, 0x038C, 0x038E..0x03A1, 0x03A3..0x03CE, 0x03D0..0x03D6, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x03E2..0x03F3, 0x0401..0x040C, 0x040E..0x044F, 0x0451..0x045C, 0x045E..0x0481, 0x0490..0x04C4, 0x04C7..0x04C8, 0x04CB..0x04CC, 0x04D0..0x04EB, 0x04EE..0x04F5, 0x04F8..0x04F9, 0x0531..0x0556, 0x0559, 0x0561..0x0586, 0x05D0..0x05EA, 0x05F0..0x05F2, 0x0621..0x063A, 0x0641..0x064A, 0x0671..0x06B7, 0x06BA..0x06BE, 0x06C0..0x06CE, 0x06D0..0x06D3, 0x06D5, 0x06E5..0x06E6, 0x0905..0x0939, 0x093D, 0x0958..0x0961, 0x0985..0x098C, 0x098F..0x0990, 0x0993..0x09A8, 0x09AA..0x09B0, 0x09B2, 0x09B6..0x09B9, 0x09DC..0x09DD, 0x09DF..0x09E1, 0x09F0..0x09F1, 0x0A05..0x0A0A, 0x0A0F..0x0A10, 0x0A13..0x0A28, 0x0A2A..0x0A30, 0x0A32..0x0A33, 0x0A35..0x0A36, 0x0A38..0x0A39, 0x0A59..0x0A5C, 0x0A5E, 0x0A72..0x0A74, 0x0A85..0x0A8B, 0x0A8D, 0x0A8F..0x0A91, 0x0A93..0x0AA8, 0x0AAA..0x0AB0, 0x0AB2..0x0AB3, 0x0AB5..0x0AB9, 0x0ABD, 0x0AE0, 0x0B05..0x0B0C, 0x0B0F..0x0B10, 0x0B13..0x0B28, 0x0B2A..0x0B30, 0x0B32..0x0B33, 0x0B36..0x0B39, 0x0B3D, 0x0B5C..0x0B5D, 0x0B5F..0x0B61, 0x0B85..0x0B8A, 0x0B8E..0x0B90, 0x0B92..0x0B95, 0x0B99..0x0B9A, 0x0B9C, 0x0B9E..0x0B9F, 0x0BA3..0x0BA4, 0x0BA8..0x0BAA, 0x0BAE..0x0BB5, 0x0BB7..0x0BB9, 0x0C05..0x0C0C, 0x0C0E..0x0C10, 0x0C12..0x0C28, 0x0C2A..0x0C33, 0x0C35..0x0C39, 0x0C60..0x0C61, 0x0C85..0x0C8C, 0x0C8E..0x0C90, 0x0C92..0x0CA8, 0x0CAA..0x0CB3, 0x0CB5..0x0CB9, 0x0CDE, 0x0CE0..0x0CE1, 0x0D05..0x0D0C, 0x0D0E..0x0D10, 0x0D12..0x0D28, 0x0D2A..0x0D39, 0x0D60..0x0D61, 0x0E01..0x0E2E, 0x0E30, 0x0E32..0x0E33, 0x0E40..0x0E45, 0x0E81..0x0E82, 0x0E84, 0x0E87..0x0E88, 0x0E8A, 0x0E8D, 0x0E94..0x0E97, 0x0E99..0x0E9F, 0x0EA1..0x0EA3, 0x0EA5, 0x0EA7, 0x0EAA..0x0EAB, 0x0EAD..0x0EAE, 0x0EB0, 0x0EB2..0x0EB3, 0x0EBD, 0x0EC0..0x0EC4, 0x0F40..0x0F47, 0x0F49..0x0F69, 0x10A0..0x10C5, 0x10D0..0x10F6, 0x1100, 0x1102..0x1103, 0x1105..0x1107, 0x1109, 0x110B..0x110C, 0x110E..0x1112, 0x113C, 0x113E, 0x1140, 0x114C, 0x114E, 0x1150, 0x1154..0x1155, 0x1159, 0x115F..0x1161, 0x1163, 0x1165, 0x1167, 0x1169, 0x116D..0x116E, 0x1172..0x1173, 0x1175, 0x119E, 0x11A8, 0x11AB, 0x11AE..0x11AF, 0x11B7..0x11B8, 0x11BA, 0x11BC..0x11C2, 0x11EB, 0x11F0, 0x11F9, 0x1E00..0x1E9B, 0x1EA0..0x1EF9, 0x1F00..0x1F15, 0x1F18..0x1F1D, 0x1F20..0x1F45, 0x1F48..0x1F4D, 0x1F50..0x1F57, 0x1F59, 0x1F5B, 0x1F5D, 0x1F5F..0x1F7D, 0x1F80..0x1FB4, 0x1FB6..0x1FBC, 0x1FBE, 0x1FC2..0x1FC4, 0x1FC6..0x1FCC, 0x1FD0..0x1FD3, 0x1FD6..0x1FDB, 0x1FE0..0x1FEC, 0x1FF2..0x1FF4, 0x1FF6..0x1FFC, 0x2126, 0x212A..0x212B, 0x212E, 0x2180..0x2182, 0x3041..0x3094, 0x30A1..0x30FA, 0x3105..0x312C, 0xAC00..0xD7A3 ] Ideographic = [ 0x4E00..0x9FA5, 0x3007, 0x3021..0x3029 ] CombiningChar = [ 0x0300..0x0345, 0x0360..0x0361, 0x0483..0x0486, 0x0591..0x05A1, 0x05A3..0x05B9, 0x05BB..0x05BD, 0x05BF, 0x05C1..0x05C2, 0x05C4, 0x064B..0x0652, 0x0670, 0x06D6..0x06DC, 0x06DD..0x06DF, 0x06E0..0x06E4, 0x06E7..0x06E8, 0x06EA..0x06ED, 0x0901..0x0903, 0x093C, 0x093E..0x094C, 0x094D, 0x0951..0x0954, 0x0962..0x0963, 0x0981..0x0983, 0x09BC, 0x09BE, 0x09BF, 0x09C0..0x09C4, 0x09C7..0x09C8, 0x09CB..0x09CD, 0x09D7, 0x09E2..0x09E3, 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, 0x0A40..0x0A42, 0x0A47..0x0A48, 0x0A4B..0x0A4D, 0x0A70..0x0A71, 0x0A81..0x0A83, 0x0ABC, 0x0ABE..0x0AC5, 0x0AC7..0x0AC9, 0x0ACB..0x0ACD, 0x0B01..0x0B03, 0x0B3C, 0x0B3E..0x0B43, 0x0B47..0x0B48, 0x0B4B..0x0B4D, 0x0B56..0x0B57, 0x0B82..0x0B83, 0x0BBE..0x0BC2, 0x0BC6..0x0BC8, 0x0BCA..0x0BCD, 0x0BD7, 0x0C01..0x0C03, 0x0C3E..0x0C44, 0x0C46..0x0C48, 0x0C4A..0x0C4D, 0x0C55..0x0C56, 0x0C82..0x0C83, 0x0CBE..0x0CC4, 0x0CC6..0x0CC8, 0x0CCA..0x0CCD, 0x0CD5..0x0CD6, 0x0D02..0x0D03, 0x0D3E..0x0D43, 0x0D46..0x0D48, 0x0D4A..0x0D4D, 0x0D57, 0x0E31, 0x0E34..0x0E3A, 0x0E47..0x0E4E, 0x0EB1, 0x0EB4..0x0EB9, 0x0EBB..0x0EBC, 0x0EC8..0x0ECD, 0x0F18..0x0F19, 0x0F35, 0x0F37, 0x0F39, 0x0F3E, 0x0F3F, 0x0F71..0x0F84, 0x0F86..0x0F8B, 0x0F90..0x0F95, 0x0F97, 0x0F99..0x0FAD, 0x0FB1..0x0FB7, 0x0FB9, 0x20D0..0x20DC, 0x20E1, 0x302A..0x302F, 0x3099, 0x309A ] Digit = [ 0x0030..0x0039, 0x0660..0x0669, 0x06F0..0x06F9, 0x0966..0x096F, 0x09E6..0x09EF, 0x0A66..0x0A6F, 0x0AE6..0x0AEF, 0x0B66..0x0B6F, 0x0BE7..0x0BEF, 0x0C66..0x0C6F, 0x0CE6..0x0CEF, 0x0D66..0x0D6F, 0x0E50..0x0E59, 0x0ED0..0x0ED9, 0x0F20..0x0F29 ] Extender = [ 0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005, 0x3031..0x3035, 0x309D..0x309E, 0x30FC..0x30FE ] Letter = BaseChar + Ideographic NCNameChar = Letter + Digit + [?., ?-, ?_] + CombiningChar + Extender NameChar = Letter + Digit + [?., ?-, ?_, ?:] + CombiningChar + Extender NCNameStart = Letter + [?_] NCName_ = '[' + ary2cc(NCNameStart) + '][' + ary2cc(NCNameChar) + ']*' Name_ = '[' + ary2cc(Letter) + '_:][' + ary2cc(NameChar) + ']*' Nmtoken_ = '[' + ary2cc(NameChar) + ']+' NCName = Regexp.new('\A' + NCName_ + '\z', nil, 'u') Name = Regexp.new('\A' + Name_ + '\z', nil, 'u') Nmtoken = Regexp.new('\A' + Nmtoken_ + '\z', nil, 'u') QName = Regexp.new('\A(' + NCName_ + ':)?' + NCName_ + '\z', nil, 'u') Char = Regexp.new('\A[' + [0x9, 0xa, 0xd, 0x20, ?-, 0x7f].pack('U*') + [0x80, ?-, 0x7ff].pack('U*') + [0x800, ?-, 0xd7ff].pack('U*') + [0xe000, ?-, 0xfffd].pack('U*') + [0x10000, ?-, 0x10ffff].pack('U*') + ']*\z', false, 'u') PubidLiteral = Regexp.new('\A[\x0d\x0a a-zA-Z0-9\-\'\(\)\+\,\./:=\?;\!\*#@$_%]*\z', nil, 'u') NameStartChar11 = [ ?:, ?A..?Z, ?_, ?a..?z, 0x00C0..0x00D6, 0x00D8..0x00F6, 0x00F8..0x02FF, 0x0370..0x037D, 0x037F..0x07FF, 0x0800..0x1FFF, 0x200C..0x200D, 0x2070..0x218F, 0x2C00..0x2FEF, 0x3001..0xD7FF, 0xF900..0xFDCF, 0xFDF0..0xFFFD, 0x10000..0xEFFFF ] NameChar11 = NameStartChar11 + [ ?-, ?., ?0..?9, 0x00B7, 0x0300..0x036F, 0x203F..0x2040 ] Name11_ = '[' + ary2cc(NameStartChar11) + '][' + ary2cc(NameChar11) + ']*' Nmtoken11_ = '[' + ary2cc(NameChar11) + ']+' NCName11_ = '[' + ary2cc(NameStartChar11 - [?:]) + '][' + ary2cc(NameChar11 - [?:]) + ']*' Name11 = Regexp.new('\A' + Name11_ + '\z', nil, 'u') Nmtoken11 = Regexp.new('\A' + Nmtoken11_ + '\z', nil, 'u') NCName11 = Regexp.new('\A' + NCName11_ + '\z', nil, 'u') QName11 = Regexp.new('\A(' + NCName11_ + ':)?' + NCName11_ + '\z', nil, 'u') ## XML 1.1 Char - RestrictedChar RestrictedChar11 = Regexp.new('\A[' + [0x9, 0xa, 0xd, 0x20, ?-, 0x7e].pack('U*') + [0x85, 0xa0, ?-, 0x7ff].pack('U*') + [0x800, ?-, 0xd7ff].pack('U*') + [0xe000, ?-, 0xfffd].pack('U*') + [0x10000, ?-, 0x10ffff].pack('U*') + ']*\z', false, 'u') Char11 = Regexp.new('\A[' + [0x1, ?-, 0x7f].pack('U*') + [0x80, ?-, 0x7ff].pack('U*') + [0x800, ?-, 0xd7ff].pack('U*') + [0xe000, ?-, 0xfffd].pack('U*') + [0x10000, ?-, 0x10ffff].pack('U*') + ']*\z', false, 'u') end class Parser XMLNS = 'http://www.w3.org/XML/1998/namespace' private def normalizeAttrValue(str) str.gsub(/[\x9\r\n]/u, ' ') end def normalizeAttrTokenValue(str) str.gsub(/\A +| +\z/u, '').gsub(/ +/, ' ') end def expandNumericCharRef(ref) if ref =~ /^\&\#(\d+);$/u ## Numeric Character Reference (Decimal) char = $1.to_i elsif ref =~ /^\&\#x([0-9a-fA-F]+);$/u ## Numeric Character Reference (Hexadecimal) char = $1.hex else raise ParseError.new("illegal numeric character reference: #{ref.inspect}") end if char > 0x10ffff || !checkChar((ref = [char].pack("U")), false) raise ParseError.new("invalid character: #{ref.inspect}") end ref end ## expand the entity value of entity declarations def expandEntityValue(text = nil, external = false) return '' if text.nil? ## parameter entity references and '%' characters are not allowed ## in an internal subset if text.index('%') && !external raise ParseError.new("invalid token: #{text}") end ret = [] ret.taint if text.tainted? while text =~ /(\&|%)([^ \t\n\r]+?\;)?/u before = Regexp.last_match.pre_match ref = Regexp.last_match[0] text = Regexp.last_match.post_match ret.push(before) if before != '' if ref == '&' || ref == '%' raise ParseError.new("invalid token: #{ref.inspect}") elsif ref =~ /^\&\#.*;$/u ## Numeric Character Reference ref = expandNumericCharRef(ref) elsif ref =~ /^%.*;$/u ## Parameter Character Reference val = getEntity(ref) if val.is_a?(Array) ## External Parameter Entity Reference if @block @block.call(:EXTERNAL_ENTITY_REF, nil, [@base, val[1], val[0]], self) else externalEntityRef(nil, @base, val[1], val[0]) end # raise ParseError.new("reference to external entity in entityDef") end @entityContext.push(ref) ref = expandEntityValue(val) ## expand recursively @entityContext.pop else ## Entity Reference if !checkNameChar(ref.gsub(/\A\&(.+);\z/u, '\1')) raise ParseError.new("invalid entity reference: #{ref.inspect}") end if @nssep && ref.index(':') raise ParseError.new("illegal entity reference: #{ref.inspect}") end end ret.push(ref) end ## end of while ret.push(text) if text != '' ret.join('') end ## parse entityRef/charRef in content text def parseRef(text = nil, &block) return '' if text.nil? ret = [] ret.taint if text.tainted? while text =~ /\&([^ \t\n\r]+?\;)?/u before = Regexp.last_match.pre_match ref = Regexp.last_match[0] text = Regexp.last_match.post_match ret.push(before) if before != '' if ref == '&' raise ParseError.new("invalid token: #{ref.inspect}") elsif ref =~ /^\&\#.*;$/u ## Numeric Character Reference ref = expandNumericCharRef(ref) ret.push(ref) else ## Entity Reference if !checkNameChar(ref.gsub(/\A\&(.+);\z/u, '\1')) raise ParseError.new("illegal entity reference: #{ref.inspect}") end if @nssep && ref.index(':') raise ParseError.new("illegal entity reference: #{ref.inspect}") end if ret.length > 0 if block_given? @block.call(:CDATA, nil, ret.join(''), self) else character(ret.join('')) end ret = [] end if @entityContext.include?(ref) raise ParseError.new("recursive entity reference: #{ref} in #{@entityContext.reverse.join(' in ')}") end entity = getEntity(ref) @entityContext.push(ref) if entity.is_a?(Array) ## external entity if entity[2] raise ParseError.new("reference to binary entity") end if block_given? @block.call(:EXTERNAL_ENTITY_REF, ref, [@base, entity[1], entity[0]], self) else externalEntityRef(ref, @base, entity[1], entity[0]) end elsif entity.nil? if block_given? @block.call(:SKIPPED_ENTITY, ref, 0, self) else skippedEntity(ref, 0) end else pp = createPartialParser(nil) pp.parse(entity, &@block) end @entityContext.pop end end ## end of while ret.push(text) if text != '' if ret.length > 0 if block_given? @block.call(:CDATA, nil, ret.join(''), self) else character(ret.join('')) end end end ## expand entityRef/charRef in attribute value/attlist default value def expandAttrValue(text = nil) return '' if text.nil? raise ParseError.new("invalid token: #{text}") if text.index('<') ret = [] ret.taint if text.tainted? text = normalizeAttrValue(text) while text =~ /\&([^ \t\n\r]+?\;)?/u before = Regexp.last_match.pre_match ref = Regexp.last_match[0] text = Regexp.last_match.post_match ret.push(before) if before != '' if ref == '&' raise ParseError.new("invalid token: #{ref}") elsif ref =~ /^\&\#.*;$/u ## Numeric Character Reference ref = expandNumericCharRef(ref) else ## Entity Reference if !checkNameChar(ref.gsub(/\A\&(.+);\z/u, '\1')) raise ParseError.new("illegal entity reference: #{ref.inspect}") end if @nssep && ref.index(':') raise ParseError.new("illegal entity reference: #{ref.inspect}") end if @entityContext.include?(ref) raise ParseError.new("recursive entity reference: #{ref} in #{@entityContext.reverse.join(' in ')}") end val = getEntity(ref) if val.is_a?(Array) raise ParseError.new("reference to external entity in attribute") end @entityContext.push(ref) ref = expandAttrValue(val) ## expand recursively @entityContext.pop end ret.push(ref) end ## end of while ret.push(text) if text != '' ret.join('') end def registerEntity(entname, entval) if @entity[entname].nil? @entity[entname] = entval return true end return false end def getEntity(entname) entname = entname.sub(/^\&?(.+)\;$/u, '\1') if !@entity[entname].nil? return @entity[entname] end if @noDTD || @onlyInternalSubset || @standalone raise ParseError.new("undeclared entity reference: #{entname.inspect}") end nil end def initialize(encoding = nil, nssep = nil) @content = '' @pos = -1 @entity = {} @encoding = encoding ? encoding.downcase : nil @nssep = nssep @ns = [{nil=>nil, 'xml'=>XMLNS}] @attlist = {} @parent = nil @standalone = false @noDTD = true @onlyInternalSubset = true @paramEntRef = false ## XML version @version = "1.0" ## external entity context @context = nil @base = nil ## specified attribute names @specifiedAttrs = nil @entityContext = [] @tokenQueue = [] registerEntity('lt', '<') registerEntity('gt', '>') registerEntity('quot', '"') registerEntity('apos', ''') registerEntity('amp', '&') end def nextToken2(content, pos) token = '' foundWS = false while pos >= 0 && !(c = content[pos, 1]).nil? if c == '' ## EOF return [token, pos] if token != '' token = nil return [token, pos] elsif c == '-' && token == '", pos + 1) if commentpos token += content[pos, commentpos - pos + 3] pos = commentpos + 3 else token += content[pos..-1] pos = -1 end return [token, pos] elsif c == '?' && token == '<' ## PI pipos = content.index("?>", pos + 1) if pipos token += content[pos, pipos - pos + 2] pos = pipos + 2 else token += content[pos..-1] pos = -1 end return [token, pos] elsif c =~ /[ \t\n\r]/u ## White Space return [token, pos] if token != '' foundWS = true token = ' ' pos += 1 next elsif c == ';' pos += 1 return [token + c, pos] elsif c =~ /[\<\[\]\/]/u ## multiple symbol token return [token, pos] if token != '' pos += 1 token = c next elsif c =~ /[\>\=\(\)\|\,\?\*\+]/u ## single symbol token return [token, pos] if token != '' pos += 1 token = c return [token, pos] elsif token == '' && (c == '"' || c == "'") ## Literal quotpos = content.index(c, pos + 1) raise ParseError.new("literal parse error") unless quotpos token = content[pos, quotpos - pos + 1] pos = quotpos + 1 return [token, pos] else ## Others return [token, pos] if token == ' ' token += c pos += 1 next end end [nil, pos] end def queueTokens(content) value = getEntity(content) pos = 0 while true token, pos = nextToken2(" #{value} ", pos) return if !token if @parent && !@context && token =~ /\A%.+\;/ queueTokens(token) else @tokenQueue.push(token) end end end def nextToken1(content, pos) token, pos = nextToken2(content, pos) if @parent && !@context && token =~ /\A%.+\;\z/ queueTokens(token) return [@tokenQueue.shift, pos] else return [token, pos] end end ## parse token ## needWS = true: need whitespace(s) before token ## prohibitWS = true: prohibitWS whitespace(s) before token ## needWS = RE: need whitespace(s) before token except RE === token ## prohibitWS = RE : prohibitWS whitespace(s) before token ## when RE === token def nextToken(needWS = false, prohibitWS = false) token = '' foundWS = false begin while true if @tokenQueue.length > 0 token = @tokenQueue.shift else token, @pos = nextToken1(@content, @pos) end if token == ' ' foundWS = true next end return token end ensure if !foundWS && needWS && !(needWS === token) raise ParseError.new("not well-formed: #{token.inspect}") elsif foundWS && (prohibitWS == true || prohibitWS === token) raise ParseError.new("not well-formed: #{token.inspect}") end end nil end def checkChar(str, restricted = true) if @version == '1.0' # str =~ YmXML::CharClass::Char str.each do |l| return nil if l !~ YmXML::CharClass::Char end return 0 elsif @version == '1.1' if restricted # str =~ YmXML::CharClass::RestrictedChar11 str.each do |l| return nil if l !~ YmXML::CharClass::RestrictedChar11 end return 0 else # str =~ YmXML::CharClass::Char11 str.each do |l| return nil if l !~ YmXML::CharClass::Char11 end return 0 end end end def checkNameChar(str) ## str =~ /\A([^\W0-9]|:)[\w\.\-:]*\z/u if @version == '1.0' str =~ YmXML::CharClass::Name elsif @version == '1.1' str =~ YmXML::CharClass::Name11 end end def checkNmtoken(str) if @version == '1.0' str =~ YmXML::CharClass::Nmtoken elsif @version == '1.1' str =~ YmXML::CharClass::Nmtoken11 end end def checkQNameChar(str) if @version == '1.0' str =~ YmXML::CharClass::QName elsif @version == '1.1' str =~ YmXML::CharClass::QName11 end end def checkPubid(str) str =~ YmXML::CharClass::PubidLiteral end def checkEncName(str) str =~ /\A[A-Za-z][A-Za-z0-9\._\-]*\z/ end def parseContentModel(token) content_model = nil if token.nil? return [nil, nil] end if token == '>' return [token, nil] end if token == 'EMPTY' || token == 'ANY' content_model = [token, "", nil, nil] token = nextToken return [token, content_model] elsif token == '(' token = nextToken token, content = parseContentModel(token) return [token, nil] if !content if content[0] == 'MIXED' ## Mixed content_model = content if token == '|' ## (#PCDATA|name1|name2|...)* names = [] content_model[3] = names begin token = nextToken token, content = parseContentModel(token) return [token, nil] if !content || content[0] != 'NAME' names << content end while token == '|' return [token, nil] if token != ')' token = nextToken(false, /\A\*\z/) return [token, nil] if token != '*' content_model[1] = '*' content_model[3] = names token = nextToken return [token, content_model] elsif token == ')' ## (#PCDATA) or (#PCDATA)* token = nextToken(false, /\A\*\z/) if token == '*' content_model[1] = '*' token = nextToken end if token == ')' raise ParseError.new("syntax error: #{token.inspect}") end return [token, content_model] end return [token, nil] elsif content[0] == 'NAME' || content[0] == 'CHOICE' || content[0] == 'SEQ' ## CHOICE or SEQ if token == '?' || token == '*'|| token == '+' content[1] = token token = nextToken end cps = [content] content_model = ['SEQ', "", nil, cps] cptype = ',' if token == '|' content_model[0] = 'CHOICE' cptype = '|' end while token == cptype token = nextToken token, content = parseContentModel(token) return [token, nil] if !content if content[0] != 'NAME' && content[0] != 'CHOICE' && content[0] != 'SEQ' return [token, nil] end if token == '?' || token == '*'|| token == '+' content[1] = token token = nextToken end cps << content end return [token, nil] if token != ')' token = nextToken(false, /\A(\?|\*|\+)\z/) if token == '?' || token == '*'|| token == '+' content_model[1] = token token = nextToken end return [token, content_model] else return [token, nil] end elsif token == '#PCDATA' token = nextToken if token == '>' raise ParseError.new("syntax error: #{token.inspect}") end return [token, ['MIXED', '', nil, nil]] else if !checkNameChar(token) raise ParseError.new("illegal element name: #{token.inspect}") end if @nssep && !checkQNameChar(token) raise ParseError.new("illegal element qname: #{token.inspect}") end content_model = ['NAME', '', token, nil] token = nextToken(false, /\A(\?|\*|\+)\z/) return [token, content_model] end content_model end ## parse DTD def parseDTD(dtd, &block) @pos -= dtd.length start = @pos if (token = nextToken) != '' raise ParseError.new("DTD parse error") end ## external DTD subset ## if externalSubset[1] ## if block_given? ## @block.call(:EXTERNAL_ENTITY_REF, nil, ## [@base, externalSubset[1], externalSubset[0]], self) ## else ## externalEntityRef(nil, @base, externalSubset[1], externalSubset[1]) ## end ## end @content[start + 1, @pos - start - 2] ## chop the first '<' and ## the last '>' end def getNSURI(prefix) @ns.reverse_each do |n| if n.include?(prefix) ## invalidated prefix (XML 1.1) if prefix && !n[prefix] raise ParseError.new("undefined namespace: #{prefix.inspect}") end return n[prefix] end end raise ParseError.new("undefined namespace: #{prefix.inspect}") end def resolveElementQName(qname) qname =~ /^(([^ \t\n\r]+):)?([^ \t\n\r]+)$/u prefix, localpart = $2, $3 uri = getNSURI(prefix) [prefix, uri, localpart] end def resolveAttributeQName(qname) qname =~ /^(([^ \t\n\r]+):)?([^ \t\n\r]+)$/u prefix, localpart = $2, $3 uri = nil uri = getNSURI(prefix) if !prefix.nil? [prefix, uri, localpart] end def getNSAttrs(args, eliminateNSDecl = false) ns = {} newargs = {} args.each do |n, v| if !checkQNameChar(n) raise ParseError.new("illegal attribute qname: #{n.inspect}") end prefix, localpart = n.split(':') if prefix == 'xmlns' if localpart == 'xml' && v != XMLNS raise ParseError.new("illegal xml namespace") elsif localpart == 'xmlns' raise ParseError.new("illegal xmlns namespace") elsif @version != '1.1' && localpart && v == '' raise ParseError.new("illegal namespace URI") end ns[localpart] = v == '' ? nil : v next if eliminateNSDecl end newargs[n] = v end [ns, newargs] end def getAttributeDefault(name) ret = {} attlist = @attlist[name] return ret unless attlist attlist.each do |k, v| next if !v[1].is_a?(String) ret[k] = v[1] end ret end ## parse Element start tag def parseElementStartTag(elem) empty = nil attrs = {} rawattrs = [] ## rewind @pos -= elem.length start = @pos name = nextToken(false, true) if !checkNameChar(name) raise ParseError.new("illegal element name: #{name.inspect}") end while !(token = nextToken(/\A(>|\/)\z/)).nil? break if token == '>' if token == '/' ## empty element tag token = nextToken(false, true) if token != '>' raise ParseError.new("element parse error") end empty = 1 break end attrname = token if !checkNameChar(attrname) raise ParseError.new("illegal attribute name: #{attrname.inspect}") end token = nextToken if token != '=' raise ParseError.new("attribute parse error") end attrvalue = nextToken if attrvalue !~ /\A([\'\"])([\w\W]*)\1\z/u raise ParseError.new("attribute parse error") end if attrs.include?(attrname) raise ParseError.new("dupulicate attribute: #{attrname.inspect}") end attrs[attrname] = expandAttrValue($2) atttype = "CDATA" if @attlist[name] && @attlist[name][attrname] atttype = @attlist[name][attrname][0] end if atttype != "CDATA" attrs[attrname] = normalizeAttrTokenValue(attrs[attrname]) end rawattrs.push([attrname, attrvalue]) end [name, attrs, empty, rawattrs] end def expect(key, include = 0) token = nil pos = @content.index(key, @pos) if pos.nil? token = @content[@pos..-1] @pos = -1 return token end token = @content[@pos, pos - @pos + include] @pos = pos + include token end def parseTag token = nil c = @content[@pos, 1] if c == '<' ## Markup token = expect(">", 1) if token[-1, 1] != '>' return nil end else ## CharData token = expect("<") end token end def normalizeLineBreak(str) return nil unless str if @version == '1.0' str.gsub(/\x0d\x0a|\x0d/u, "\x0a") elsif @version == '1.1' str.gsub(/\x0d\x0a|\x0d#{[0x85].pack('U')}|#{[0x85].pack('U')}|#{[0x2028].pack('U')}|\x0d/u, "\x0a") end end def doPreParseProcessing ## check validity of chars if !checkChar(@content[0..-1]) raise ParseError.new("invalid character") end ## normalize line break if @pos >= 0 @content[@pos..-1] = normalizeLineBreak(@content[@pos..-1]) end end public ## parse XML def parse(content, &block) @content = content if !content.is_a?(InputStream) && content.respond_to?('read') @content = InputStream.new(content) end @block = block @pos = 0 estack = [] documentElement = nil ## empty entity if @parent && (@content.nil? || @content == '') return 0 end if @encoding && @content.is_a?(InputStream) @content.setEncoding(@encoding) end if !@context && !@parent if block_given? @block.call(:START_DOCUMENT, nil, nil, self) else startDocument() end end lastContent = nil while @pos >= 0 part = parseTag oldpart = part if part.nil? raise ParseError.new("unexpected EOF") elsif part !~ /\A') raise ParseError.new("invalid token: \"]]>\"") end parseRef(part, &block) next else first = part[1] if first == ?? && part =~ /\A<\?xml[ \t\n\r\?]/u ## XML/Text Declaration ## @parent == nil --- XML decl ## @parent != nil && ## @context == nil --- error ## @context != nil --- Text decl if (part =~ /\A<\?xml([ \t\n\r]+version[ \t\n\r]*=[ \t\n\r]*(['"])([a-zA-Z0-9_.:\-]+)\2)?([ \t\n\r]+encoding[ \t\n\r]*=[ \t\n\r]*(['"])(.*?)\5)?([ \t\n\r]+standalone[ \t\n\r]*=[ \t\n\r]*(['"])(yes|no)\8)?[ \t\n\r]*\?>/u) != 0 raise ParseError.new("illegal XML declaration") end version = $3 encoding = $6 standalone = $9 if lastContent || (@context.nil? && @parent) raise ParseError.new("xml declaration not at start of external entity") end if encoding && !checkEncName(encoding) raise ParseError.new("invalid encoding: #{encoding.inspect}") end standalone = if !standalone -1 elsif standalone == 'yes' 1 else 0 end if @parent ## Text Declaration if !encoding || standalone != -1 raise ParseError.new("invalid text declaration") end if version && version != '1.0' && version != '1.1' raise ParseError.new("version #{version} not supported") end lastContent = :TEXT_DECL else ## XML Declaration if !version raise ParseError.new("invalid XML declaration") end if version != '1.0' && version != '1.1' raise ParseError.new("version #{version} not supported") end if block_given? @block.call(:XML_DECL, nil, [version, encoding, standalone], self) else xmlDecl(version, encoding, standalone) end lastContent = :XML_DECL end if version @version = version end if standalone == 1 @standalone = true end if encoding && @content.is_a?(InputStream) @content.setEncoding(encoding.downcase) end end ## pre-parse processing after XML/Text Declaration if !lastContent || lastContent == :XML_DECL || lastContent == :TEXT_DECL doPreParseProcessing part = normalizeLineBreak(part) if lastContent lastContent = :PREPARSE next end end if first == ?? ## Processing Instruction lastContent = :PI if part !~ /\?>\z/u part += expect("?>", 2) if part !~ /\?>\z/u raise ParseError.new("processing instruction data expected") end end part = part[2..-3] ## strip "" part =~ /\A([^ \t\n\r]+)([ \t\n\r]+(.*))?\z/mu name = $1 data = $3.to_s if !checkNameChar(name) || name =~ /\Axml\z/i raise ParseError.new("illegal PI name: #{name.inspect}") end if @nssep && name.index(':') raise ParseError.new("illegal PI name: #{name.inspect}") end if block_given? @block.call(:PI, name, data, self) else processingInstruction(name, data) end next elsif first == ?! if part =~ /\A\z/u part += expect("-->", 3) if part !~ /-->\z/u raise ParseError.new("comment must end with \"-->\"") end end part = part[4..-4].to_s ## strip "" if part =~ /--/u || part =~ /\A-|-\z/u raise ParseError.new("comment must not contain '--'") end if block_given? @block.call(:COMMENT, nil, part, self) else comment(part) end next elsif part =~ /\A\z/u part += expect("]]>", 3) if part !~ /\]\]>\z/u raise ParseError.new("\"\"") end end part = part[9..-4] if block_given? @block.call(:START_CDATA, nil, nil, self) else startCdata() end if block_given? @block.call(:CDATA, nil, part, self) else character(part) end if block_given? @block.call(:END_CDATA, nil, nil, self) else endCdata() end next else raise ParseError.new("unknown markup: #{part.inspect}") end else if !@parent && documentElement && estack.length == 0 raise ParseError.new("junk after document element") end lastContent = :ELEMENT name = nil attrs = nil rawattrs = nil empty = nil endTagP = nil if part =~ /\A<\//u ## element end tag name = part name.sub!(/\A<\/([^ \t\n\r]+)[ \t\n\r]*>\z/u, '\1') ##!!! check name if !checkNameChar(name) raise ParseError.new("illegal element name: #{name.inspect}") end if @nssep && !checkQNameChar(name) raise ParseError.new("illegal element qname: #{name.inspect}") end endTagP = 1 if estack.length == 0 || name != estack.pop raise ParseError.new("not opened end tag: #{name.inspect}") end else ## element start tag name, attrs, empty, rawattrs = parseElementStartTag(oldpart[1..-1]) getAttributeDefault(name).each do |k, v| attrs[k] = v unless attrs.include?(k) end if !checkNameChar(name) raise ParseError.new("illegal element name: #{name.inspect}") end estack.push(name) if !empty if @nssep ns, attrs = getNSAttrs(attrs, true) @ns.push(ns) ns.each do |prefix, uri| if block_given? @block.call(:START_NAMESPACE_DECL, prefix, uri, self) else startNamespaceDecl(prefix, uri) end end if !checkQNameChar(name) raise ParseError.new("illegal element qname: #{name.inspect}") end prefix, uri, localpart = resolveElementQName(name) name = uri.to_s + @nssep + localpart + @nssep + prefix.to_s newattrs = {} namechk = {} @specifiedAttrs = [] rawattrs.each do |k, v| next if k =~ /^xmlns:|^xmlns$/u prefix, uri, localpart = resolveAttributeQName(k) k = uri.to_s + @nssep + localpart + @nssep + prefix.to_s @specifiedAttrs.push(k) end attrs.each do |k, v| prefix, uri, localpart = resolveAttributeQName(k) expname = uri.to_s + ", " + localpart k = uri.to_s + @nssep + localpart + @nssep + prefix.to_s newattrs[k] = v if namechk.include?(expname) raise ParseError.new("duplicate expanded attribute name: #{expname.inspect}") end namechk[expname] = v end attrs = newattrs end documentElement = name if !documentElement if block_given? @block.call(:START_ELEM, name, attrs, self) else startElement(name, attrs) end end if empty || endTagP if @nssep if endTagP prefix, uri, localpart = resolveElementQName(name) name = uri.to_s + @nssep + localpart + @nssep + prefix.to_s end ns = @ns.pop end if block_given? @block.call(:END_ELEM, name, nil, self) else endElement(name) end if @nssep ns.to_a.reverse.each do |prefix, uri| if block_given? @block.call(:END_NAMESPACE_DECL, prefix, nil, self) else endNamespaceDecl(prefix) end end end end next end end end @content = nil @pos = -1 if !@parent && !documentElement raise ParseError.new("no element found") end if estack.length != 0 raise ParseError.new("unclosed element: #{estack.pop.inspect}") end if !@context && !@parent if block_given? @block.call(:END_DOCUMENT, nil, nil, self) else endDocument() end end end ## stop to parse def stop(message = "stopped") raise ParseStopped.new(message) end def done ## dummy end def createChildParser(context, encoding = nil, nssep = @nssep, *args) parser = self.class.new(encoding, nssep, *args) entity = @entity ns = @ns attlist = @attlist parent = self base = @base entityContext = @entityContext noDTD = @noDTD onlyInternalSubset = @onlyInternalSubset standalone = @standalone paramEntRef = @paramEntRef parser.instance_eval do ## inherit the parent environments @context = context @entity = entity @ns = ns @attlist = attlist @parent = parent @base = base @entityContext = entityContext @noDTD = noDTD @onlyInternalSubset = onlyInternalSubset @standalone = standalone @paramEntRef = paramEntRef end parser end def createPartialParser(context, encoding = nil, nssep = @nssep, *args) parser = createChildParser(context, encoding, nssep, *args) def parser.character(text) @parent.character(text) end def parser.xmlDecl(version, encoding, standalone) @parent.xmlDecl(version, encoding, standalone) end def parser.processingInstruction(name, data) @parent.processingInstruction(name, data) end def parser.comment(data) @parent.comment(data) end def parser.startElement(name, attrs) @parent.startElement(name, attrs) end def parser.endElement(name) @parent.endElement(name) end def parser.startNamespaceDecl(prefix, uri) @parent.startNamespaceDecl(prefix, uri) end def parser.endNamespaceDecl(prefix) @parent.endNamespaceDecl(prefix) end def parser.elementDecl(name, model) @parent.elementDecl(name, model) end def parser.attlistDecl(elname, attname, att_type, dflt, isrequired) @parent.attlistDecl(elname, attname, att_type, dflt, isrequired) end def parser.notationDecl(notationName, base, systemId, publicId) @parent.notationDecl(notationName, @base, systemId, publicId) end def parser.entityDecl(entityName, isparameter_entity, value, base, systenId, publicId, notationName) @parent.entityDecl(entityName, isparameter_entity, value, @base, systenId, publicId, notationName) end def parser.externalEntityRef(context, base, systemId, publicId) @parent.externalEntityRef(context, @base, systemId, publicId) end def parser.skippedEntity(name, isParameterEntity) @parent.skippedEntity(name, isParameterEntity) end def parser.startCdata() @parent.startCdata() end def parser.endCdata() @parent.endCdata() end def parser.startDocument() @parent.startDocument() end def parser.endDocument() @parent.endDocument() end parser end private :createPartialParser def parseExternalEntity(context, content) if !context ## external DTD subset ## raise ParseError.new('external DTD subset not implemented') pp = createPartialParser(nil) pp.setBase(content.getURIBase) pp.parseDTDSubset(content, &@block) elsif content[0] == '%' ## parameter entity ref raise ParseError.new('parameter entity ref not implemented') else ## parse pp = createPartialParser(context) pp.setBase(content.getURIBase) pp.parse(content, &@block) end end ## parse DTD subset def parseDTDSubset(content, external = true, &block) if content @content = content if !content.is_a?(InputStream) && content.respond_to?('read') @content = InputStream.new(content) end @block = block @pos = 0 end first = true while !(token = nextToken).nil? break if token == ']' if external && first first = false if token =~ /\A<\?xml/u ## Text Declaration if (token =~ /\A<\?xml([ \t\n\r]+version[ \t\n\r]*=[ \t\n\r]*(['"])([a-zA-Z0-9_.:\-]+)\2)?([ \t\n\r]+encoding[ \t\n\r]*=[ \t\n\r]*(['"])(.*?)\5)?[ \t\n\r]*\?>/u) != 0 raise ParseError.new("illegal Text declaration") end version = $3 encoding = $6 if encoding && !checkEncName(encoding) raise ParseError.new("invalid encoding: #{encoding.inspect}") end if !encoding raise ParseError.new("invalid text declaration") end if version && version != '1.0' && version != '1.1' raise ParseError.new("version #{version} not supported") end if version @version = version end if encoding && @content.is_a?(InputStream) @content.setEncoding(encoding.downcase) end lastContent = :TEXT_DECL end ## pre-parse processing after /Text Declaration doPreParseProcessing token = normalizeLineBreak(token) next if lastContent == :TEXT_DECL end if token == '\z/)) == 'NDATA' notation = nextToken(true) token = nextToken extEnt[2] = notation if !checkNameChar(notation) raise ParseError.new("illegal notation name: #{notation.inspect}") end if @nssep && notation.index(':') raise ParseError.new("illegal notation name: #{notation.inspect}") end ## !!! check notation name to be declared end else token = nextToken end if token != '>' raise ParseError.new("entity declaration parse error") end if extEnt && (@standalone || !@paramEntRef) registered = registerEntity(isPE ? "%#{name}" : name, extEnt) end if registered if block_given? @block.call(:ENTITY_DECL, name, [isPE, value, @base, extid, pubid, notation], self) else entityDecl(name, isPE, value, @base, extid, pubid, notation) end end elsif token == '' attname = token if !checkNameChar(attname) raise ParseError.new("invalid attribute name: #{attname.inspect}") end if @nssep && !checkQNameChar(attname) raise ParseError.new("illegal attribute qname: #{attname.inspect}") end atttype = nextToken(true) if atttype !~ /^(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|NOTATION|\()$/ raise ParseError.new("invalid attribute type: #{atttype.inspect}") end if atttype == 'NOTATION' if (token = nextToken(true)) != '(' raise ParseError.new("invalid enumerated attribute declaration: #{token.inspect}") end end enum = [] if atttype == '(' || atttype == 'NOTATION' enum << nextToken if !checkNmtoken(enum[-1]) raise ParseError.new("invalid nmtoken: #{enum[-1]}") end while (token = nextToken) != ')' if token != '|' raise ParseError.new("invalid attribute enumeration delimiter: #{token.inspect}") end enum << nextToken if !checkNmtoken(enum[-1]) raise ParseError.new("invalid nmtoken: #{enum[-1]}") end end if atttype == 'NOTATION' atttype = "NOTATION(#{enum.join('|')})" else atttype = "(#{enum.join('|')})" end end attval = nextToken(true) attfixed = false if attval == "#FIXED" attfixed = true attval = nextToken(true) end if attval !~ /^(\#REQUIRED|\#IMPLIED|'[^\']*'|"[^\"]*")$/ raise ParseError.new("invalid attribute defaults") end if attval[0] == ?# ## #REQUIRED or #IMPLIED attval = attval.intern else ## string literal attval = attval[1..-2] end if @standalone || !@paramEntRef # atttype = 'TOKEN' if atttype != 'CDATA' attval = expandAttrValue(attval) if attval.is_a?(String) if atttype != "CDATA" && attval.is_a?(String) attval = normalizeAttrTokenValue(attval) end if !@attlist.include?(elname) @attlist[elname] = {} end registered = false if !@attlist[elname].include?(attname) @attlist[elname][attname] = [atttype, attval] registered = true end if registered required = false if attval == "#REQUIRED".intern required = true end attval = nil if attval.is_a?(Symbol) if block_given? @block.call(:ATTLIST_DECL, elname, [attname, atttype, attval, required], self) else attlistDecl(elname, attname, atttype, attval, required) end end end end elsif token == '' || content_model[0] == 'NAME' raise ParseError.new("invalid content model: #{elname.inspect}") end if block_given? @block.call(:ELEMENT_DECL, elname, content_model, self) else elementDecl(elname, content_model) end elsif token == '' extid = token[1..-2] token = nextToken end else raise ParseError.new("invalid notaion decl #{nname.inspect}") end if token != '>' raise ParseError.new("invalid notation decl: #{nname.inspect}") end if block_given? @block.call(:NOTATION_DECL, nname, [@base, extid, pubid], self) else notationDecl(nname, @base, extid, pubid) end elsif token[0] == ?% ## parameter entity reference if token !~ /\A%([^;]+);\z/ raise ParseError.new("illegal PE name: #{token.inspect}") end name = $1 if !checkNameChar(name) raise ParseError.new("illegal PE name: #{name.inspect}") end if @nssep && name.index(':') raise ParseError.new("illegal PE name: #{name.inspect}") end @onlyInternalSubset = false if @entityContext.include?("%#{name}") raise ParseError.new("recursive entity reference: %#{name} in #{@entityContext.reverse.join(' in ')}") end val = getEntity("%#{name}") @entityContext.push("%#{name}") if val.is_a?(Array) || !val @paramEntRef = true else pp = createPartialParser(token) pp.setBase(@base) pp.parseDTDSubset(val, &@block) end @entityContext.pop elsif token =~ /\A<\?/u ## PI if token !~ /\?>\z/u raise ParseError.new("processing instruction data expected") end part = token[2..-3] ## strip "" part =~ /\A([^ \t\n\r]+)([ \t\n\r]+(.*))?\z/mu name = $1 data = $3.to_s if !checkNameChar(name) || name =~ /\Axml\z/i raise ParseError.new("illegal PI name: #{name.inspect}") end if @nssep && name.index(':') raise ParseError.new("illegal PI name: #{name.inspect}") end if block_given? @block.call(:PI, name, data, self) else processingInstruction(name, data) end elsif token =~ /\A\z/u raise ParseError.new("comment must end with \"-->\"") end part = token[4..-4].to_s ## strip "" if part =~ /--/u || part =~ /\A-|-\z/u raise ParseError.new("comment must not contain '--'") end if block_given? @block.call(:COMMENT, nil, part, self) else comment(part) end else raise ParseError.new("unsupported token: #{token.inspect}") # print "Token: #{token}\n" end end end def inspect sprintf("#<%s: id=%d>", self.class.to_s, object_id) end def setBase(base) if base.nil? @base = base else @base = base.dup.freeze end end def getBase @base end def line return 0 if @content.nil? || @pos.nil? || @content.length == 0 @content[0..@pos].count("\n") + 1 end def getSpecifiedAttributes @specifiedAttrs end def setReturnNSTriplet(flag) ## dummy end def getContentURI if @content.is_a?(InputStream) return [@content.uri, line] elsif @parent && @content.nil? return @parent.getContentURI end ["-", line] end ## ## Default handler ## protected def character(text) end def xmlDecl(version, encoding, standalone) end def processingInstruction(name, data) end def comment(data) end def startElement(name, attrs) end def endElement(name) end def startNamespaceDecl(prefix, uri) end def endNamespaceDecl(prefix) end def elementDecl(name, model) end def attlistDecl(elname, attname, att_type, dflt, isrequired) end def notationDecl(notationName, base, systemId, publicId) end def entityDecl(entityName, isparameter_entity, value, base, systenId, publicId, notationName) end def externalEntityRef(context, base, systemId, publicId) end def skippedEntity(name, isParameterEntity) end def startCdata() end def endCdata() end def startDocument() end def endDocument() end end ## listener module Listener attr_reader :currentParser def listen self.method(:dispatch) end public :listen private def dispatch(type, name, data, parser) @currentParser = parser case type when :CDATA character(data) when :XML_DECL xmlDecl(*data) when :PI processingInstruction(name, data) when :COMMENT comment(data) when :START_ELEM startElement(name, data) when :END_ELEM endElement(name) when :START_NAMESPACE_DECL startNamespaceDecl(name, data) when :END_NAMESPACE_DECL endNamespaceDecl(name) when :ELEMENT_DECL elementDecl(name, data) when :ATTLIST_DECL attlistDecl(name, *data) when :NOTATION_DECL notationDecl(name, *data) when :ENTITY_DECL entityDecl(name, *data) when :EXTERNAL_ENTITY_REF externalEntityRef(name, *data) when :SKIPPED_ENTITY skippedEntity(name, data) when :START_CDATA startCdata() when :END_CDATA endCdata() when :START_DOCUMENT startDocument() when :END_DOCUMENT endDocument() when :EXTERNAL_ENTITY_REF externalEntityRef(name, *data) else ename = type.to_s.downcase.gsub(/_([a-z])/) { $1.upcase } if respond_to?(ename) send(ename, name, *data) else raise YmXML::ParseError.new("unknown event: #{type}") end end end protected def character(text) end def xmlDecl(version, encoding, standalone) end def processingInstruction(name, data) end def comment(data) end def startElement(name, attrs) end def endElement(name) end def startNamespaceDecl(prefix, uri) end def endNamespaceDecl(prefix) end def elementDecl(name, model) end def attlistDecl(elname, attname, att_type, dflt, isrequired) end def notationDecl(notationName, base, systemId, publicId) end def entityDecl(entityName, isparameter_entity, value, base, systenId, publicId, notationName) end def externalEntityRef(context, base, systemId, publicId) end def skippedEntity(name, isParameterEntity) end def startCdata() end def endCdata() end def startDocument() end def endDocument() end end class InputStream attr_reader :uri CP1252_TO_UCS = [0x20ac, 0xfffd, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021, 0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0xfffd, 0x017d, 0xfffd, 0xfffd, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, 0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0xfffd, 0x017e, 0x0178] def combineSurrogatePair(ary) i = 0 len = ary.length ret = [] while i < len c = ary[i] if c >= 0xd800 && c <= 0xdbff && i + 1 < len && ary[i+1] >= 0xdc00 && ary[i+1] <= 0xdfff i += 1 low = ary[i] c = (((c & 1023)) << 10 | (low & 1023)) + 0x10000 end ret << c i += 1 end ret end private :combineSurrogatePair def initialize(stream, encoding = nil, &block) @encoding = encoding ? encoding.downcase : nil @block = block @autodetectedEncoding = nil @uri = nil if stream.is_a?(String) @content = stream else @content = stream.read end taintp = @content.tainted? ## auto encoding detection if @encoding.nil? if @content.length >= 4 if @content[0] == 0xff && @content[1] == 0xfe && @content[2] != 0 ## UTF-16 (LE) @content = combineSurrogatePair(@content[2..-1].unpack('v*')).pack('U*') @autodetectedEncoding = 'utf-16' elsif @content[0] == 0xfe && @content[1] == 0xff && @content[3] != 0 ## UTF-16 (BE) @content = combineSurrogatePair(@content[2..-1].unpack('n*')).pack('U*') @autodetectedEncoding = 'utf-16' elsif @content[0..3] == "<\0?\0" ## UTF-16LE @content = combineSurrogatePair(@content.unpack('v*')).pack('U*') @autodetectedEncoding = 'utf-16le' elsif @content[0..3] == "\0<\0?" ## UTF-16BE @content = combineSurrogatePair(@content.unpack('n*')).pack('U*') @autodetectedEncoding = 'utf-16be' elsif @content[0] == 0xff && @content[1] == 0xfe && @content[2] == 0 && @content[3] == 0 ## UTF-32 (LE) @content = @content[4..-1].unpack('V*').pack('U*') @autodetectedEncoding = 'utf-32' elsif @content[0] == 0 && @content[1] == 0 && @content[2] == 0xfe && @content[3] == 0xff ## UTF-32 (BE) @content = @content[4..-1].unpack('N*').pack('U*') @autodetectedEncoding = 'utf-32' elsif @content[0..7] == "<\0\0\0?\0\0\0" ## UTF-32LE @content = @content.unpack('V*').pack('U*') @autodetectedEncoding = 'utf-32le' elsif @content[0..7] == "\0\0\0<\0\0\0?" ## UTF-32BE @content = @content.unpack('N*').pack('U*') @autodetectedEncoding = 'utf-32be' elsif @content[0] == 0xef && @content[1] == 0xbb && @content[2] == 0xbf ## UTF-8 (BOM) @content = @content[3..-1] @autodetectedEncoding = 'utf-8' elsif @content[0..3] == "\x4c\x6f\xa7\x94" || @content[0..3] == "\x4c\x6f\xb7\x75" || @content[0..3] == "\x4c\x6f\xab\x73" # elsif @content[0] == 0x4c && @content[1] == 0x6f && # @content[2] == 0xa7 && @content[3] == 0x94 raise EncodingError.new("EBCDIC not supported") end end elsif @encoding == 'us-ascii' ## no conversion elsif @encoding == 'utf-8' ## delete BOM if @content[0] == 0xef && @content[1] == 0xbb && @content[2] == 0xbf ## UTF-8 (BOM) @content = @content[3..-1] end elsif @encoding == 'utf-16' if @content.length >= 4 if @content[0] == 0xff && @content[1] == 0xfe && @content[2] != 0 ## UTF-16 (LE) @content = combineSurrogatePair(@content[2..-1].unpack('v*')).pack('U*') elsif @content[0] == 0xfe && @content[1] == 0xff && @content[3] != 0 ## UTF-16 (BE) @content = combineSurrogatePair(@content[2..-1].unpack('n*')).pack('U*') else raise EncodingError.new("illegal UTF-16 sequence") end end elsif @encoding == 'utf-16le' @content = combineSurrogatePair(@content.unpack('v*')).pack('U*') elsif @encoding == 'utf-16be' @content = combineSurrogatePair(@content.unpack('n*')).pack('U*') elsif @encoding == 'utf-32' if @content[0] == 0xff && @content[1] == 0xfe && @content[2] == 0 && @content[3] == 0 ## UTF-32 (LE) @content = @content[4..-1].unpack('V*').pack('U*') elsif @content[0] == 0 && @content[1] == 0 && @content[2] == 0xfe && @content[3] == 0xff ## UTF-32 (BE) @content = @content[4..-1].unpack('N*').pack('U*') else raise EncodingError.new("illegal UTF-32 sequence") end elsif @encoding == 'utf-32le' @content = @content.unpack('V*').pack('U*') elsif @encoding == 'utf-32be' @content = @content.unpack('N*').pack('U*') else @content = unknownEncoding(@encoding, @content) end @content.taint if taintp end def setEncoding(encoding) return if @encoding ## already set if @autodetectedEncoding && @autodetectedEncoding != encoding raise EncodingError.new("encoding does not match auto detected encoding (#{@autodetectedEncoding}): #{encoding.inspect}") end @encoding = encoding return if encoding == 'utf-8' || encoding == 'us-ascii' || encoding == @autodetectedEncoding @content = unknownEncoding(encoding, @content) end def self._getURIBase(uri = nil) uri =~ /^(.*?\/?)[^\/]*$/ $1 end def self._getURIHost(uri) uri =~ /^((https?|ftp|file):\/\/[^\/]*\/?).*$/ $1 end def self._catURI(baseuri, uri) baseuri = baseuri.to_s if uri =~ /^([a-zA-Z]+):/ uri elsif uri =~/^\// host = _getURIHost(baseuri) host =~ /^(.*?)\/?$/ $1.to_s + uri else base = _getURIBase(baseuri) # base =~ /^(.*?)\/?$/ base + uri end end def setURI(uri) @uri = uri end def getURI @uri end def getURIBase self.class._getURIBase(@uri) end def self.openFile(file, encoding = nil, &block) ret = self.new(open(file), encoding, &block) ret.setURI(file) ret end begin require 'open-uri'; @@FETCH_CMD = proc {|uri| open(uri) } rescue LoadError @@FETCH_CMD = '/usr/bin/curl -s' end def self.setURIResolver(cmd) @@FETCH_CMD = cmd end def self.openURI(uri, base = nil, encoding = nil, &block) uri = _catURI(base, uri) if @@FETCH_CMD.is_a?(Proc) ret = self.new(@@FETCH_CMD.call(uri), encoding, &block) else if uri =~ /^(https?|ftp|file):/ ret = self.new(open("|#{@@FETCH_CMD} '#{uri}'"), encoding, &block) else ret = self.new(open(uri), encoding, &block) end end ret.setURI(uri) ret end def index(pat, start) return nil if @content.nil? @content.index(pat, start) end def length return 0 if @content.nil? @content.length end def [](pos, len = nil) return nil if @content.nil? if len.nil? @content[pos] else @content[pos, len] end end def []=(pos, value1, value2 = nil) return if @content.nil? if value2.nil? @content[pos] = value1 else @content[pos, value1] = value2 end end def unknownEncoding(encoding, content) case encoding when 'euc-jp' begin require 'nkf' if NKF::UTF8 return NKF.nkf('-Ewm0x', content) end rescue LoadError,NameError end begin require 'iconv' return Iconv.iconv('UTF-8', 'EUC-JP', content).join('') rescue LoadError end begin require 'uconv' return Uconv.euctou8(content) rescue LoadError end when 'shift_jis' begin require 'nkf' if NKF::UTF8 return NKF.nkf('-Swm0x', content) end rescue LoadError,NameError end begin require 'iconv' return Iconv.iconv('UTF-8', 'SHift_JIS', content).join('') rescue LoadError end begin require 'uconv' return Uconv.sjistou8(content) rescue LoadError end when 'iso-2022-jp' begin require 'nkf' if NKF::UTF8 return NKF.nkf('-Jwm0x', content) end rescue LoadError,NameError end begin require 'iconv' require 'nkf' return Iconv.iconv('UTF-8', 'EUC-JP', NKF.nkf('-Jem0x', content)).join('') rescue LoadError end begin require 'uconv' require 'nkf' return Uconv.euctou8(NKF.nkf('-Jem0x', content)) rescue LoadError end when 'iso-8859-1' return @content.gsub(/([\x80-\xff])/n) {|m| [m[0]].pack('U') } when 'windows-1252' return @content.gsub(/([\x80-\xff])/n) {|m| m[0] < 0xa0 ? [CP1252_TO_UCS[m[0] - 0x80]].pack('U') : [m[0]].pack('U') } else if @block return @block.call(encoding, content) end end raise EncodingError.new("unknown encoding: #{encoding.inspect}") end private :unknownEncoding end end if $0 == __FILE__ $OPT_i = false $OPT_e = nil begin require 'optparse' ARGV.options do |o| o.banner << ' ' o.on('-i', '--ignore-ns', 'ignore namespaces') do $OPT_i = true end o.on('-e', '--encoding ENCODING', 'force input character encoding') do |arg| $OPT_e = arg end o.parse! end rescue LoadError require 'parsearg' $USAGE = 'print "Usage: #{$0} [-i] [-e ] -i: ignore namespaces -e: force input character encoding\n"' parseArgs(0, nil, "i", "e:") end nssep = if !$OPT_i then '|' else nil end ## YmXML::InputStream.setURIResolver("wget -O - -o /dev/null") stream = if ARGV.length == 0 YmXML::InputStream.new($<, $OPT_e) elsif /^(https?|ftp|file):/ =~ ARGV[0] YmXML::InputStream.openURI(ARGV[0], nil, $OPT_e) else YmXML::InputStream.openFile(ARGV[0], $OPT_e) end begin parser = YmXML::Parser.new(nil, nssep) parser.setBase(stream.getURIBase) parser.parse(stream) do |t, n, d, pp| p([t, n, d]) if t == :EXTERNAL_ENTITY_REF begin pp.parseExternalEntity(n, YmXML::InputStream.openURI(d[1], d[0])) rescue Errno::ENOENT warn "file not found: #{d[1]}" end end end rescue YmXML::Error => e uri, l = parser.getContentURI $stderr.puts "#{uri}(#{l}): #{e}" $stderr.puts e.backtrace.join("\n\tfrom ") end end