#!/usr/local/bin/ruby
# review search (html decode)
# 2003.2.7 ver.0.1 by ippo - 2003.2.18 ver.0.?
#require 'simpleuri'
require 'webagent'
require 'kconv'
class ReviewSearch
# Records the script's own URL and the stream used for diagnostic output.
def initialize
  # Diagnostics go to the process-wide standard error stream.
  @stderr = $stderr
  # Taken from the SCRIPT_URI environment variable; an unset variable gives ''.
  # A variant built from REMOTE_HOST + REQUEST_URI was left disabled by the
  # author (with the open question whether REMOTE_HOST equals REMOTE_ADDR).
  @script_url = "#{ENV['SCRIPT_URI']}"
end
#========== 取得・出力 ========== ========== ==========
#========== fetch / output ========== ========== ==========
# Extracts book references from one HTML page and resolves their link targets.
#
# url         - address of the page; nil yields an empty list.
# flag_decode - selects the extractor: 'amazon', 'bk1', 'none' or 'bk1_bibid';
#               any other value (including nil) uses decode_refer.
# reviewer    - when non-blank after stripping, stored as the reviewer field
#               (index 2) of every entry.
# flag_name   - 'name' makes each entry point at an anchor of the page
#               (page#name) instead of at a relative link.
# mode        - nil: entry URLs are made absolute against +url+;
#               anything else: in 'name' mode they become "file#name"
#               relative to the page's file name, otherwise they are left alone.
# bodyfile    - page content supplied by the caller; when nil the page is
#               downloaded with WebAgent and split with engine_a_decode.
#
# Returns an array of entries whose first three fields are
# [code, url, reviewer].
def decode_html(url = nil, flag_decode = nil, reviewer = nil, flag_name = nil, mode = nil, bodyfile = nil)
  list = []
  # Must come before any use of url: File.split(nil) raises TypeError.
  return list if url.nil?

  # Treat a bare host, or a last path segment without an extension, as a directory.
  file = File.split(url).last
  if url.sub(/^http:\/\//, '') !~ /\//
    url = url + '/'
  elsif file !~ /\./
    url = url + '/'
  end

  @messageKcode = 3 # 1-jis, 2-euc, 3-sjis
  @insystemKcode = 2 # euc

  body = bodyfile
  if body.nil?
    agent = WebAgent.new()
    agent.uri = url
    agent.get()
    body = agent.body.kconv(@insystemKcode)
    body = engine_a_decode(body, flag_name)
  end

  list = case flag_decode
         when 'amazon'    then decode_amazon(body, flag_name)
         when 'bk1'       then decode_bk1(body, flag_name)
         when 'none'      then decode_none(body, flag_name)
         when 'bk1_bibid' then decode_bk1_bibid(body, flag_name)
         else                  decode_refer(body, flag_name)
         end

  # Entries are [code, url, reviewer, ...]; decode_refer appends further
  # fields, so the reviewer has to be addressed by position 2, not as "last".
  reviewer = reviewer.to_s.strip
  list.each { |entry| entry[2] = reviewer } if reviewer != ''

  if mode.nil?
    url_org = URI.parse(url)
    page = url.sub(/#.*$/, '')
    list.each { |entry|
      if flag_name == 'name'
        name = entry[1].to_s
        entry[1] = page
        entry[1] += '#' + name if name != ''
      else
        entry[1] = url_org.merge(entry[1].to_s).to_s
      end
    }
  elsif flag_name == 'name'
    # Local mode: refer to anchors relative to the page's own file name.
    file = File.split(url).last.sub(/\#.*$/, '')
    list.each { |entry|
      if entry[1].to_s == ''
        entry[1] = file
      else
        entry[1] = file + '#' + entry[1]
      end
    }
  end
  return list
end
#========== 小物 ========== ========== ==========
# Normalises an http URL: strips the scheme, drops empty and '.' segments,
# resolves '..' against the preceding segment and re-attaches 'http://'.
# A trailing slash is not kept.
#
# The first segment is the host, so a '..' that would climb above it is
# discarded instead of consuming the host.
#
# url - String such as 'http://host/a/../b.html'.
#
# Returns the normalised URL as a new String.
def url_check(url)
  segments = []
  url.sub(/^http:\/\//mi, '').split('/').each { |part|
    case part
    when '', '.'
      # contributes nothing to the path
    when '..'
      segments.pop if segments.size > 1
    else
      segments << part
    end
  }
  'http://' + segments.join('/')
end
#========== 展開・解析 ========== ========== ==========
# Splits an HTML text into blocks at selected <a ...> tags.
#
# NOTE(review): parts of the original body were lost (the text between the
# split pattern and the attribute loop, and everything after the final
# block append), leaving the method unparsable. The attribute tests and the
# "close the current block, start a new one" step below are the surviving
# statements; the splitting, the variable initialisation and the final flush
# are reconstructed from them and from how the decode_* methods consume the
# result. Verify against an intact copy of the script if one exists.
#
# text      - HTML source as a String.
# flag_name - 'name': a new block starts at every anchor carrying an id= or
#             name= attribute, and the block is labelled with that value.
#             Otherwise a new block starts at every anchor whose quoted href
#             is neither an absolute http:// URL nor a mailto: link, and the
#             block is labelled with that href.
#
# Returns an array of [block_text, name, url] triples; the text before the
# first qualifying anchor forms a block with empty name and url.
def engine_a_decode(text, flag_name = nil)
  newtexts = []
  newblock, name, url = '', '', ''
  # The capture group makes split keep the anchor tags as separate pieces.
  text.to_s.split(/(<a\s[^>]*>)/mi).each { |piece|
    flag = false
    newname, newurl = '', ''
    if piece =~ /\A<a\s([^>]*)>\z/mi
      # Attributes are inspected as whitespace-separated tokens.
      $1.to_s.split.each { |check|
        token = check.strip
        if flag_name == 'name'
          # Each pattern that matches overwrites the previous result, so the
          # quoted form wins over the bare form and name= wins over id=.
          [/^id=([^>]+)/mi, /^id=\"(.+?)\"/mi, /^name=([^>]+)/mi, /^name=\"(.+?)\"/mi].each { |pattern|
            if token =~ pattern
              newname = $1
              flag = true
            end
          }
        elsif token =~ /^href=\"(.+?)\"/mi
          tmpurl = $1
          if (tmpurl !~ /^http:\/\//) && (tmpurl !~ /^mailto:/)
            flag = true
            newurl = tmpurl
          end
        end
      }
    end
    if flag
      newtexts << [newblock, name, url]
      newblock, name, url = '', newname, newurl
    end
    newblock += piece
  }
  newtexts << [newblock, name, url] if newblock.size > 0
  return newtexts
end
# Interprets the site's own review tag inside each block.
#
# text - either a String (split into blocks with engine_a_decode first) or
#        an array of [block, name, url] triples.
# flag - 'name' makes the block's anchor name serve as the default URL.
#
# Each tag body is a ':'-separated list of fields: a field starting with
# ISBN is the code; url=, reviewer=, urlmemo=, point= and memo= set the
# corresponding values. Because URLs themselves contain ':', a field that is
# exactly "url=http" is re-joined with the field that follows it.
#
# Returns the distinct entries [code, url, reviewer, urlmemo, point, memo].
def decode_refer(text, flag = nil)
  # Object#type is gone from current Ruby, and it returned a Class that never
  # equalled the string 'String'; is_a? expresses the intended check.
  texts = text.is_a?(String) ? engine_a_decode(text, flag) : text
  list = []
  texts.each { |block, blockname, blockurl|
    blockurl = blockname if flag == 'name'
    # NOTE(review): the tag pattern below is empty. It presumably contained
    # markup that was lost from this copy of the file; with an empty pattern
    # scan yields Strings and str.first fails. Restore the real pattern, with
    # one capture group around the tag body, before relying on this method.
    block.scan(//) { |str|
      code, url, reviewer, urlmemo, point, memo = '', blockurl, '', '', '', ''
      vals = str.first.split(':')
      vals.each_index { |i|
        var = vals[i]
        code = var.strip if var =~ /^ISBN/
        url = var.sub(/^url=/, '').strip if var =~ /^url=/
        reviewer = var.sub(/^reviewer=/, '').strip if var =~ /^reviewer=/
        urlmemo = var.sub(/^urlmemo=/, '').strip if var =~ /^urlmemo=/
        point = var.sub(/^point=/, '').strip if var =~ /^point=/
        memo = var.sub(/^memo=/, '').strip if var =~ /^memo=/
        # to_s guards a tag that ends right after "url=http".
        url = 'http:' + vals[i + 1].to_s.strip if var =~ /^url=http$/
      }
      # url may be nil when the block carries no default; to_s keeps the
      # emptiness test from raising.
      list << [code, url, reviewer, urlmemo, point, memo] if (code + url.to_s + reviewer) != ''
    }
  }
  return list.uniq
end
=begin
def decode_refer( text )
list = []
text.scan(//) { |str|
code, url, reviewer = '', '', ''
str.first.split(':').each{ |var|
code = var.strip if var =~ /^ISBN/
url = var.sub(/^url=/,'').strip if var =~ /^url=/
reviewer = var.sub(/^reviewer=/,'').strip if var =~ /^reviewer=/
}
list << [code, url, reviewer] if (code + url + reviewer) != ''
}
return list
end
=end
# Collects ISBNs from links to amazon.co.jp product pages.
#
# text - either a String (split into blocks with engine_a_decode first) or
#        an array of [block, name, url] triples.
# flag - 'name' makes the block's anchor name serve as the entry URL;
#        otherwise the block's url is used.
#
# Returns the distinct entries ['ISBN<digits>', url, ''].
def decode_amazon(text, flag = nil)
  # Object#type is gone from current Ruby, and it returned a Class that never
  # equalled the string 'String'; is_a? expresses the intended check.
  texts = text.is_a?(String) ? engine_a_decode(text, flag) : text
  list = []
  texts.each { |block, blockname, blockurl|
    url = (flag == 'name') ? blockname : blockurl
    block.to_s.scan(/href=\"http:\/\/www\.amazon\.co\.jp\/exec\/obidos\/ASIN\/(.+?)\//mi) { |str|
      asin = str.first
      # ASINs of CDs and the like apparently do not start with a digit,
      # so only all-digit codes (with an optional X check character) count.
      if asin =~ /^(\d+X?)$/i
        list << ['ISBN' + $1.upcase.strip, url, '']
      end
    }
  }
  return list.uniq
end
# Collects ISBNs from links to the bk1.co.jp book search
# (srch_result_book.cgi with an isbn= query parameter).
#
# text - either a String (split into blocks with engine_a_decode first) or
#        an array of [block, name, url] triples.
# flag - 'name' makes the block's anchor name serve as the entry URL;
#        otherwise the block's url is used.
#
# Returns the distinct entries ['ISBN<value>', url, ''], with whitespace
# removed from the value and letters upper-cased.
def decode_bk1(text, flag = nil)
  # Object#type is gone from current Ruby, and it returned a Class that never
  # equalled the string 'String'; is_a? expresses the intended check.
  texts = text.is_a?(String) ? engine_a_decode(text, flag) : text
  list = []
  texts.each { |block, blockname, blockurl|
    url = (flag == 'name') ? blockname : blockurl
    block.to_s.scan(/href=\"http:\/\/www\.bk1\.co\.jp\/cgi-bin\/srch\/srch_result_book\.cgi(.+?)\"/mi) { |str|
      if str.first =~ /isbn=([^&]+)/mi
        list << ['ISBN' + $1.gsub(/\s/, '').upcase.strip, url, '']
      end
    }
  }
  return list.uniq
end
# Collects ISBNs written as plain text ("ISBN" followed by optional colons,
# bars or whitespace and then digits/hyphens with an optional X).
#
# text - either a String (split into blocks with engine_a_decode first) or
#        an array of [block, name, url] triples.
# flag - 'name' makes the block's anchor name serve as the entry URL;
#        otherwise the block's url is used.
#
# Returns the distinct entries ['ISBN<value>', url, ''], value upper-cased.
def decode_none(text, flag = nil)
  # Object#type is gone from current Ruby, and it returned a Class that never
  # equalled the string 'String'; is_a? expresses the intended check.
  texts = text.is_a?(String) ? engine_a_decode(text, flag) : text
  list = []
  texts.each { |block, blockname, blockurl|
    url = (flag == 'name') ? blockname : blockurl
    # The pattern is pure ASCII, so it carries no fixed-encoding flag: an
    # EUC-pinned regex raises on multibyte text in any other encoding.
    # NOTE(review): the original alternation listed "ISBN" twice and the
    # separator class repeated ':' and ' '; presumably full-width variants
    # were lost from this copy. Only the ASCII forms are matched here.
    block.to_s.scan(/ISBN[:|\s]*?(\d[\d-]+X?)/mi) { |str|
      list << ['ISBN' + str.first.upcase.strip, url, '']
    }
  }
  return list.uniq
end
# Collects bk1 bibliography ids from links to the bk1.co.jp detail page
# (srch_detail.cgi with a bibid= query parameter).
#
# text - either a String (split into blocks with engine_a_decode first) or
#        an array of [block, name, url] triples.
# flag - 'name' makes the block's anchor name serve as the entry URL;
#        otherwise the block's url is used.
#
# Returns the distinct entries ['bibid<value>', url, ''].
def decode_bk1_bibid(text, flag = nil)
  # Object#type is gone from current Ruby, and it returned a Class that never
  # equalled the string 'String'; is_a? expresses the intended check.
  texts = text.is_a?(String) ? engine_a_decode(text, flag) : text
  list = []
  texts.each { |block, blockname, blockurl|
    url = (flag == 'name') ? blockname : blockurl
    block.to_s.scan(/href=\"http:\/\/www\.bk1\.co\.jp\/cgi-bin\/srch\/srch_detail\.cgi(.+?)\"/mi) { |str|
      if str.first =~ /bibid=([^&]+)/mi
        list << ['bibid' + $1.strip, url, '']
      end
    }
  }
  return list.uniq
end
#========== 表示・飾り用 ========== ========== ==========
# Builds the one-line markup for a "register" button.
#
# NOTE(review): the template heredoc is empty in this copy of the file, so
# none of the parameters are used and the result is always ''. The markup
# presumably went missing; restore it inside the heredoc.
#
# Author's open question: unless the data is sent with POST, the '#' inside
# the URL may get in the way?
def button_regist(code, url, reviewer = '', urlmemo = '', point = '', memo = '', target = 'blank', script_name = '')
  template = <<-END
  END
  # Collapse the template onto one line: drop newlines first, then squeeze
  # every whitespace run to a single blank and trim both ends.
  without_breaks = template.gsub(/\n/, '')
  without_breaks.gsub(/\s+/, ' ').strip
end
# Returns the display text for a book code.
#
# As written, only +code+ contributes to the result (converted to a String);
# +asid+ and +target+ are accepted but unused.
def link_amazon(code, asid = 'ipposjunkbox-22', target = 'blank')
  "#{code}"
end
# Returns the display text for a URL.
#
# As written, only +url+ contributes to the result (converted to a String);
# +target+ is accepted but unused.
def link_url(url, target = 'blank')
  "#{url}"
end
end
=begin
rs_sample = ReviewSearch.new()
#list = rs_sample.decode_html('http://ippo.s5.xrea.com/diary/s_diary.html', 'amazon', 'r-ippo')
#list = rs_sample.decode_html('http://ippo.s5.xrea.com/diary/s_diary.html', nil, 'r-ippo', nil, 'local')
#list = rs_sample.decode_html('http://ippo.s5.xrea.com/diary/s_d200302.html', 'amazon', 'r-ippo', 'name', 'local')
#list = rs_sample.decode_html('http://alisato.cool.ne.jp/diary/200302a.html', nil, nil, 'name')
list = rs_sample.decode_html('http://alisato.cool.ne.jp/diary/200302a.html', 'none', nil, 'name')
p list
#p rs_sample.button_regist('ISBNtest', 'test.html', 'testman')
#p rs_sample.link_amazon('ISBNtest')
#p rs_sample.link_url('test.html')
=end
=begin
#おおたさんへ:多分こんな感じでいけるかと。
rs = ReviewSearch.new()
wa = WebAgent.new()
wa.uri = 'http://www14.cds.ne.jp/~not/antenna/ndiary.html'
wa.get()
abody = wa.body.kconv(2)
list = []
alllist = []
targets = ['ISBN4061821725','ISBN4086001977']
#煙か土か食い物
#天気晴朗
urllist = []
abody.scan(/href=\"(.+?)\"/){ |str|
url = str.first
urllist << url if url =~ /^http:\/\//
}
urllist.uniq.each{ |url|
$stderr.puts url
list0 = []
begin
wa.uri = url
wa.get()
body = wa.body.kconv(2)
list1 = rs.decode_html(url, 'none' , nil, nil, nil, body)
list2 = rs.decode_html(url, 'amazon', nil, nil, nil, body)
list3 = rs.decode_html(url, 'bk1' , nil, nil, nil, body)
list0 = list1 + list2 + list3
rescue
end
list0.uniq.each{ |line|
targets.each{ |target|
if line[0].to_s.gsub(/-/,'') == target
list << [line[0], line[1]]
$stderr.puts "CHK; #{line[0]} #{line[1]}"
end
}
}
alllist += list0.uniq
}
list.uniq.sort.each{ |line| puts line.join(',') }
=end