Ruby、HpricotとDiff::LCSで二つのHTMLをエレメント単位でDiffる

Diff::LCSについてはあまり情報がない。Algorithm::Diff@Perlruby移植らしい。

htmlのエレメントを階層構造を維持したまま一次元配列に展開し、diffる。
現状のコードだとエレメント構造を復元するときエレメントが融合しちゃってアレなんだけど。番兵入れたほうがいいなあ。

諸事情により作ったものの、結局使わなかった

結果の例

入力a
<html>
	<head>
		<title>The Blog</title>
	</head>
	<body>
		<h1>The Blog</h1>
		<h2>エントリ1</h2>
		<div id="main">
			<p>hogehoge</p>
			<p>hagehage</p>
			<p>fugafuga</p>
		</div>
		<div id="footer">
			<p>Counter: 1234</p>
			<p>Copyright: id:gnarl 2008</p>
		</div>
	</body>
</html>
入力b
<html>
	<head>
		<title>The Blog</title>
	</head>
	<body>
		<h1>The Blog</h1>
		<h2>エントリ2</h2>
		<div id="main">
			<p>hage</p>
			<p>hoge</p>
		</div>
		<div id="footer">
			<p>Counter: 1236</p>
			<p>Copyright: id:gnarl 2008</p>
		</div>
	</body>
</html>
結果
<html >
<body >
<h2 >
エントリ1</h2>

<div id="main">
<p >
hogehoge
hagehage

fugafuga</p>
</div>

<div id="footer">
<p >
Counter: 1234</p>
</div>
</body>
</html>

ソース

{pull:file path=~/src/sandbox/blogentry/html_diff_cmd.rb last_sync=2008.05.08,18:31}
require 'kconv'
require 'open-uri'
require 'html_diff.rb'

unless ARGV.length==2
	puts 'usage: html_diff_cmd.rb path_a path_b'
	exit
end

def html_diff(path_a,path_b)
	a=open(path_a){|f|f.read}.toutf8
	b=open(path_b){|f|f.read}.toutf8
	HTMLDiff.htmldiff(a,b)
end

print html_diff(ARGV[0],ARGV[1])
{/pull}
{pull:file path=~/src/sandbox/blogentry/html_diff.rb last_sync=2008.05.08,18:30}
require 'rubygems'
require 'hpricot'
require 'cgi'
require 'diff/lcs'

module HTMLDiff
	class Element
		attr_reader :attributes
		attr_reader :name
		def initialize(name,attributes)
			@name=name
			@attributes=attributes
		end
		def to_s
		"#{name}[#{attributes.map{|k,v|"#{k}=\"#{v.gsub(/\s/,'')}\""}.join(' ')}]"
		end
		def ==(rhs)
			name==rhs.name and attributes==rhs.attributes
		end
		def eql?(rhs)
			self==rhs
		end
	end
	class Node
		attr_reader :children
		def initialize(elm)
			throw "not Element: #{elm.inspect}" if elm.class!=Element
			@element=elm
			@children=[]
		end
		def name
			@element.name
		end
		def attributes
			@element.attributes
		end
		def insert_elm_at(elm,depth)
			return children.last.insert_elm_at(elm,depth-1) if 0 < depth
			children << elm
		end
		def to_html
			case name
			when 'text'
				return CGI.escapeHTML(attributes['value'])
			when 'comment'
				return attributes['value']
			end
			result=''
			result << '<' << name << ' '
			result << attributes.map{|k,v| "#{CGI.escapeHTML(k)}=\"#{CGI.escapeHTML(v)}\""}.join(' ')
			if children.empty?
				result << ' />' << "\n"
				return result
			end
			result << '>' << "\n"
			result << children.map{|c| 
				next if c.nil? #ignore error
				c.to_html
			}.join("\n")
			result << '</' << name << '>' << "\n"
			result
		end
		def innerText
			return attributes['value'] if name=='text'
			children.map{|c|c.innerText}.join
		end
	end

	def self.htmldiff(a,b)
		h_a=Hpricot(a)
		h_b=Hpricot(b)

		# 文字列化してdiffらないと共通部分まで変更ありとみなされてしまう、謎
		a_a=create_pathed_elm([],h_a.at('//html'))
		s_a=a_a.map{|l|l.map{|e|e.to_s}.join('/')}
		a_b=create_pathed_elm([],h_b.at('//html'))
		s_b=a_b.map{|l|l.map{|e|e.to_s}.join('/')}

		diff_s=Diff::LCS.sdiff(s_a,s_b)
		diff_elm_nums=diff_s.select{|d|d.action=='-' or d.action=='!'}.map{|d|d.old_position}
		tree=merge_pathed_elm(diff_elm_nums.map{|i|a_a[i]})[0]
		return tree.to_html
	end

	def self.htmlsamepart(a,b,out)
		h_a=Hpricot(File.read(a))
		h_b=Hpricot(File.read(b))

		a_a=create_pathed_elm([],h_a.at('//html'))
		s_a=a_a.map{|l|l.map{|e|e.to_s}.join('/')}
		a_b=create_pathed_elm([],h_b.at('//html'))
		s_b=a_b.map{|l|l.map{|e|e.to_s}.join('/')}

		diff_s=Diff::LCS.sdiff(s_a,s_b)
		diff_elm_nums=diff_s.select{|d|d.action=='='}.map{|d|d.old_position}
		tree=merge_pathed_elm(diff_elm_nums.map{|i|a_a[i]})[0]
		str=tree.to_html
		open(out,'w'){|f|f.write str}
	end

	def self.convert_hpricot_elm(elm)
		if elm.text?
			return Element.new( 'text',{'value'=>elm.to_s} )
		elsif elm.comment?
			return Element.new( 'comment',{'value'=>elm.to_s} )
		elsif elm.elem?
			return Element.new( elm.name,elm.attributes )
		end
		throw 'unknown tag: '+elm.class.name
	end

	# memo: input is hpricot elem
	def self.create_pathed_elm(parent_path,elm)
		return [] if elm.text? and elm.inner_text.strip.size==0
		return [] if elm.bogusetag?
		cur=parent_path+[convert_hpricot_elm(elm)]
		return [cur] unless elm.elem?
		r=[cur]
		elm.children.each{|c|
			throw "its array" if c.kind_of? Array
			r.concat(create_pathed_elm(cur,c))
		}
		r.reject{|l|l.empty?}
	end

	def self.elm_array_to_s(elms)
		elms.map{|line|line.map{|elm|elm.to_s}.join('/')}.join("\n")
	end

	def self.merge_pathed_elm(paths)
		root=Node.new(Element.new('root',{}))
		prev=[]
		paths.each{|cur|
			d=cur.first_diff_pos(prev)
			root.insert_elm_at(array_to_elm_tree(cur[d...cur.length]),d);
			prev=cur
		}
		root.children
	end

	def self.array_to_elm_tree(elms)
		root=Node.new(Element.new('root',{}))
		parent=root
		elms.each{|e|
			en=Node.new(e)
			parent.children << en
			parent=en
		}
		root.children[0]
	end

	def self.print_dom_structure(node,prev_path='',indent=0)
		return unless node.elem?
		puts ' '*indent+node.xpath
		node.each_child do|child|
			print_dom_structure child,indent+1
		end
	end
end

class Array
	def first_diff_pos(rhs)
		i=0
		while i<self.length and i<rhs.length and self[i]==rhs[i] do
			i+=1
		end
		i
	end
	def different_part(rhs)
		self[self.first_diff_pos(rhs)...self.length]
	end
	def same_part(rhs)
		self[0...self.first_diff_pos(rhs)]
	end
end
{/pull}