Ruby、HpricotとDiff::LCSで二つのHTMLをエレメント単位でDiffる
Diff::LCSについてはあまり情報がない。Algorithm::Diff@Perlのruby移植らしい。
htmlのエレメントを階層構造を維持したまま一次元配列に展開し、diffる。
現状のコードだとエレメント構造を復元するときエレメントが融合しちゃってアレなんだけど。番兵入れたほうがいいなあ。
諸事情により作ったものの、結局使わなかった
結果の例
入力a
<html> <head> <title>The Blog</title> </head> <body> <h1>The Blog</h1> <h2>エントリ1</h2> <div id="main"> <p>hogehoge</p> <p>hagehage</p> <p>fugafuga</p> </div> <div id="footer"> <p>Counter: 1234</p> <p>Copyright: id:gnarl 2008</p> </div> </body> </html>
入力b
<html> <head> <title>The Blog</title> </head> <body> <h1>The Blog</h1> <h2>エントリ2</h2> <div id="main"> <p>hage</p> <p>hoge</p> </div> <div id="footer"> <p>Counter: 1236</p> <p>Copyright: id:gnarl 2008</p> </div> </body> </html>
結果
<html > <body > <h2 > エントリ1</h2> <div id="main"> <p > hogehoge hagehage fugafuga</p> </div> <div id="footer"> <p > Counter: 1234</p> </div> </body> </html>
ソース
{pull:file path=~/src/sandbox/blogentry/html_diff_cmd.rb last_sync=2008.05.08,18:31} require 'kconv' require 'open-uri' require 'html_diff.rb' unless ARGV.length==2 puts 'usage: html_diff_cmd.rb path_a path_b' exit end def html_diff(path_a,path_b) a=open(path_a){|f|f.read}.toutf8 b=open(path_b){|f|f.read}.toutf8 HTMLDiff.htmldiff(a,b) end print html_diff(ARGV[0],ARGV[1]) {/pull}
{pull:file path=~/src/sandbox/blogentry/html_diff.rb last_sync=2008.05.08,18:30} require 'rubygems' require 'hpricot' require 'cgi' require 'diff/lcs' module HTMLDiff class Element attr_reader :attributes attr_reader :name def initialize(name,attributes) @name=name @attributes=attributes end def to_s "#{name}[#{attributes.map{|k,v|"#{k}=\"#{v.gsub(/\s/,'')}\""}.join(' ')}]" end def ==(rhs) name==rhs.name and attributes==rhs.attributes end def eql?(rhs) self==rhs end end class Node attr_reader :children def initialize(elm) throw "not Element: #{elm.inspect}" if elm.class!=Element @element=elm @children=[] end def name @element.name end def attributes @element.attributes end def insert_elm_at(elm,depth) return children.last.insert_elm_at(elm,depth-1) if 0 < depth children << elm end def to_html case name when 'text' return CGI.escapeHTML(attributes['value']) when 'comment' return attributes['value'] end result='' result << '<' << name << ' ' result << attributes.map{|k,v| "#{CGI.escapeHTML(k)}=\"#{CGI.escapeHTML(v)}\""}.join(' ') if children.empty? result << ' />' << "\n" return result end result << '>' << "\n" result << children.map{|c| next if c.nil? #ignore error c.to_html }.join("\n") result << '</' << name << '>' << "\n" result end def innerText return attributes['value'] if name=='text' children.map{|c|c.innerText}.join end end def self.htmldiff(a,b) h_a=Hpricot(a) h_b=Hpricot(b) # 文字列化してdiffらないと共通部分まで変更ありとみなされてしまう、謎 a_a=create_pathed_elm([],h_a.at('//html')) s_a=a_a.map{|l|l.map{|e|e.to_s}.join('/')} a_b=create_pathed_elm([],h_b.at('//html')) s_b=a_b.map{|l|l.map{|e|e.to_s}.join('/')} diff_s=Diff::LCS.sdiff(s_a,s_b) diff_elm_nums=diff_s.select{|d|d.action=='-' or d.action=='!'}.map{|d|d.old_position} tree=merge_pathed_elm(diff_elm_nums.map{|i|a_a[i]})[0] return tree.to_html end def self.htmlsamepart(a,b,out) h_a=Hpricot(File.read(a)) h_b=Hpricot(File.read(b)) a_a=create_pathed_elm([],h_a.at('//html')) s_a=a_a.map{|l|l.map{|e|e.to_s}.join('/')} a_b=create_pathed_elm([],h_b.at('//html')) s_b=a_b.map{|l|l.map{|e|e.to_s}.join('/')} diff_s=Diff::LCS.sdiff(s_a,s_b) diff_elm_nums=diff_s.select{|d|d.action=='='}.map{|d|d.old_position} tree=merge_pathed_elm(diff_elm_nums.map{|i|a_a[i]})[0] str=tree.to_html open(out,'w'){|f|f.write str} end def self.convert_hpricot_elm(elm) if elm.text? return Element.new( 'text',{'value'=>elm.to_s} ) elsif elm.comment? return Element.new( 'comment',{'value'=>elm.to_s} ) elsif elm.elem? return Element.new( elm.name,elm.attributes ) end throw 'unknown tag: '+elm.class.name end # memo: input is hpricot elem def self.create_pathed_elm(parent_path,elm) return [] if elm.text? and elm.inner_text.strip.size==0 return [] if elm.bogusetag? cur=parent_path+[convert_hpricot_elm(elm)] return [cur] unless elm.elem? r=[cur] elm.children.each{|c| throw "its array" if c.kind_of? Array r.concat(create_pathed_elm(cur,c)) } r.reject{|l|l.empty?} end def self.elm_array_to_s(elms) elms.map{|line|line.map{|elm|elm.to_s}.join('/')}.join("\n") end def self.merge_pathed_elm(paths) root=Node.new(Element.new('root',{})) prev=[] paths.each{|cur| d=cur.first_diff_pos(prev) root.insert_elm_at(array_to_elm_tree(cur[d...cur.length]),d); prev=cur } root.children end def self.array_to_elm_tree(elms) root=Node.new(Element.new('root',{})) parent=root elms.each{|e| en=Node.new(e) parent.children << en parent=en } root.children[0] end def self.print_dom_structure(node,prev_path='',indent=0) return unless node.elem? puts ' '*indent+node.xpath node.each_child do|child| print_dom_structure child,indent+1 end end end class Array def first_diff_pos(rhs) i=0 while i<self.length and i<rhs.length and self[i]==rhs[i] do i+=1 end i end def different_part(rhs) self[self.first_diff_pos(rhs)...self.length] end def same_part(rhs) self[0...self.first_diff_pos(rhs)] end end {/pull}