#!/usr/local/bin/ruby
# review search (html decode)
# 2003.2.7 ver.0.1 by ippo - 2003.2.18 ver.0.?
#
# NOTE(review): this file was restored from a copy in which line breaks had
# been collapsed and every "<...>" span (regex fragments, HTML markup inside
# string literals) had been stripped.  Places where the lost text had to be
# reconstructed or could not be recovered carry their own NOTE(review) marker;
# compare them against a pristine copy before relying on them.

#require 'simpleuri'
require 'uri'
require 'webagent'
require 'kconv'

# Extracts book references (ISBN / bk1 bibid codes) from an HTML page and pairs
# each one with the URL or anchor name of the page section it was found in.
class ReviewSearch
  def initialize
    @script_url = ENV['SCRIPT_URI'].to_s
    # @script_url = 'http://' + ENV['REMOTE_HOST'].to_s + ENV['REQUEST_URI'].to_s  #REMOTE_HOST = REMOTE_ADDR ?
    @stderr = $stderr
  end

  #========== fetch / output ========== ========== ==========

  # Decodes one page into a list of reference rows.
  #
  # url         - page URL; nil yields an empty list.
  # flag_decode - 'amazon', 'bk1', 'none' or 'bk1_bibid' select the matching
  #               decode_* method; anything else selects decode_refer.
  # reviewer    - when non-blank, overrides the reviewer field of every row.
  # flag_name   - 'name' keys blocks by anchor id/name instead of link href.
  # mode        - nil produces absolute URLs; any other value produces
  #               file-relative references (only rewritten in 'name' mode).
  # bodyfile    - page body supplied by the caller; when nil the page is
  #               fetched with WebAgent and converted with kconv.
  #
  # Returns an Array of rows as produced by the selected decode_* method.
  def decode_html(url = nil, flag_decode = nil, reviewer = nil, flag_name = nil, mode = nil, bodyfile = nil)
    list = []
    # The guard has to run before any String operation on url.
    return list if url.nil?

    # Treat the URL as a directory when it has no path at all or when its last
    # segment has no extension.  A URL that already ends in '/' is left alone
    # so that no '//' is produced.
    if url !~ /\/\z/
      file = File.basename(url)
      if url.sub(/^http:\/\//, '') !~ /\// || file !~ /\./
        url = url + '/'
      end
    end

    @messageKcode = Kconv::SJIS  # was the literal 3 (1-jis, 2-euc, 3-sjis)
    @insystemKcode = Kconv::EUC  # was the literal 2 (euc)
    # @stderr.puts ('CHK; access; ' + url).kconv(@messageKcode)

    body = bodyfile
    if body.nil?
      agent = WebAgent.new
      agent.uri = url
      agent.get
      body = agent.body.kconv(@insystemKcode)
      body = engine_a_decode(body, flag_name)
    end

    list =
      case flag_decode
      when 'amazon'    then decode_amazon(body, flag_name)
      when 'bk1'       then decode_bk1(body, flag_name)
      when 'none'      then decode_none(body, flag_name)
      when 'bk1_bibid' then decode_bk1_bibid(body, flag_name)
      else                  decode_refer(body, flag_name)
      end

    reviewer = reviewer.to_s.strip
    if reviewer != ''
      # The reviewer is always field 2.  Writing to the last field would
      # overwrite `memo` in the six-field rows returned by decode_refer.
      list.each { |row| row[2] = reviewer }
    end

    if mode.nil?
      # Global mode: turn every reference into an absolute URL.
      url_org = URI.parse(url)
      list.each { |row|
        if flag_name == 'name'
          name = row[1].to_s
          row[1] = url.sub(/#.*$/, '')
          row[1] += '#' + name if name != ''
        else
          row[1] = url_org.merge(row[1].to_s).to_s
        end
      }
    else
      # Local mode: references become "file" or "file#name".
      file = File.basename(url).sub(/\#.*$/, '')
      list.each { |row|
        # The original tested an undefined local `flag` here (NameError).
        if flag_name == 'name'
          if row[1].to_s == ''
            row[1] = file
          else
            row[1] = file + '#' + row[1]
          end
        end
      }
    end

    # Earlier implementation, kept for reference:
    #   if flag_name == 'name'
    #     head, file = File.split(url)
    #     file.gsub!(/\#.*$/,'')
    #     list.each_index{ |i| #url:name->file#name
    #       list[i][1] = file + '#' + list[i][1] if list[i][1].to_s != ''
    #       list[i][1] = file if list[i][1].to_s == ''
    #     }
    #   end
    #   if mode.nil?
    #     head, file = File.split(url)
    #     list.each_index{ |i| #url:local->global
    #       list[i][1] = file.gsub(/#.*?$/,'') + list[i][1] if list[i][1] =~ /^#/
    #       list[i][1] = File.join(head, list[i][1])
    #     }
    #   end
    #   list.each_index{ |i|
    #     list[i][1] = url_check(list[i][1])
    #   }

    return list
  end

  #========== small utilities ========== ========== ==========

  # Normalizes an http URL: drops empty and '.' segments and resolves '..'.
  # The result always starts with 'http://' and never ends with '/'.
  def url_check(url)
    resolved = []
    url.sub(/^http:\/\//mi, '').split('/').each { |segment|
      next if segment == '' || segment == '.'
      if segment == '..'
        # Never climb above the host.  The original index arithmetic
        # duplicated segments and looped forever in that case.
        resolved.pop if resolved.size > 1
      else
        resolved << segment
      end
    }
    return 'http://' + resolved.join('/')
  end

  #========== expansion / parsing ========== ========== ==========

  # Splits an HTML text into blocks, starting a new block at every <a ...> tag
  # that carries a key: an id=/name= attribute when flag_name is 'name',
  # otherwise an href= that is neither http:// nor mailto: (a local link).
  #
  # Returns an Array of [block_text, name, url] triples.
  #
  # NOTE(review): the split pattern, the per-block initialisation and the tail
  # of this method were destroyed by tag stripping.  Only the attribute checks
  # and the `newtexts << [newblock, name, url]` bookkeeping are original; the
  # rest is a reconstruction.  The /e (EUC) regex flag was dropped from the
  # attribute patterns because they contain only ASCII.
  def engine_a_decode(text, flag_name = nil)
    newtexts = []
    newblock, name, url = '', '', ''
    # Look-ahead split keeps each "<a " at the head of its block, so nothing
    # has to be re-attached afterwards.
    text.split(/(?=<a\s)/mi).each { |block|
      flag = false
      newname, newurl = '', ''
      tag = block[/\A<a\s([^>]+>)/mi, 1].to_s
      tag.split.each { |check|
        case flag_name
        when 'name'
          if (check.strip =~ /^id=([^>]+)/mi)
            newname = $1
            flag = true
          end
          if (check.strip =~ /^id=\"(.+?)\"/mi)
            newname = $1
            flag = true
          end
          if (check.strip =~ /^name=([^>]+)/mi)
            newname = $1
            flag = true
          end
          if (check.strip =~ /^name=\"(.+?)\"/mi)
            newname = $1
            flag = true
          end
        else
          if (check.strip =~ /^href=\"(.+?)\"/mi)
            tmpurl = $1
            if (tmpurl !~ /^http:\/\//) && (tmpurl !~ /^mailto:/)
              flag = true
              newurl = tmpurl
            end
          end
        end
      }
      if flag
        # Close the running block and start a new one keyed by this anchor.
        newtexts << [newblock, name, url]
        newblock, name, url = '', newname, newurl
      end
      newblock << block
    }
    newtexts << [newblock, name, url] if newblock.size > 0
    return newtexts
  end

  # Yields [block_text, key] for every block of `text`, where key is the
  # block's anchor name when flag is 'name' and its URL otherwise.  A String
  # is first split with engine_a_decode; anything else is assumed to be an
  # already-decoded block list.
  #
  # The decoders used to test `text.type == 'String'`.  That compares a Class
  # with a String, so it was never true and raw bodies were never split.
  def each_block(text, flag)
    blocks = text.is_a?(String) ? engine_a_decode(text, flag) : text
    blocks.each { |block, blockname, blockurl|
      yield block, (flag == 'name' ? blockname : blockurl)
    }
  end
  private :each_block

  # Interprets the site's own reference tag: colon-separated fields such as
  # "ISBN...", "url=", "reviewer=", "urlmemo=", "point=" and "memo=".
  #
  # Returns unique rows of [code, url, reviewer, urlmemo, point, memo].
  #
  # NOTE(review): the scan pattern was stripped from the recovered source
  # (only `//` survived).  An HTML comment, <!-- ... -->, is assumed here
  # because it is consistent with what the stripping removed; confirm the
  # real tag syntax against an original copy.
  def decode_refer(text, flag = nil)
    list = []
    each_block(text, flag) { |block, blockurl|
      block.scan(/<!--(.+?)-->/m) { |str|
        code, url, reviewer, urlmemo, point, memo = '', blockurl, '', '', '', ''
        vals = str.first.split(':')
        vals.each_index { |i|
          var = vals[i]
          code     = var.strip if var =~ /^ISBN/
          url      = var.sub(/^url=/, '').strip if var =~ /^url=/
          reviewer = var.sub(/^reviewer=/, '').strip if var =~ /^reviewer=/
          urlmemo  = var.sub(/^urlmemo=/, '').strip if var =~ /^urlmemo=/
          point    = var.sub(/^point=/, '').strip if var =~ /^point=/
          memo     = var.sub(/^memo=/, '').strip if var =~ /^memo=/
          # "url=http://..." was cut in two by the ':' split; glue it back.
          url = 'http:' + vals[i + 1].to_s.strip if var =~ /^url=http$/
        }
        list << [code, url, reviewer, urlmemo, point, memo] if (code + url.to_s + reviewer) != ''
      }
    }
    return list.uniq
  end

=begin
  # Older three-field version, kept for reference.  Its scan pattern was lost
  # in the same way as the one in the live method.
  def decode_refer( text )
    list = []
    text.scan(//) { |str|
      code, url, reviewer = '', '', ''
      str.first.split(':').each{ |var|
        code = var.strip if var =~ /^ISBN/
        url = var.sub(/^url=/,'').strip if var =~ /^url=/
        reviewer = var.sub(/^reviewer=/,'').strip if var =~ /^reviewer=/
      }
      list << [code, url, reviewer] if (code + url + reviewer) != ''
    }
    return list
  end
=end

  # Collects ISBNs from amazon.co.jp ASIN links.
  # Returns unique rows of [code, url, reviewer] with an empty reviewer.
  def decode_amazon(text, flag = nil)
    list = []
    each_block(text, flag) { |block, blockurl|
      block.scan(/href=\"http:\/\/www.amazon.co.jp\/exec\/obidos\/ASIN\/(.+?)\//mi) { |str|
        asin = str.first
        # ASINs of CDs and the like apparently do not start with a digit.
        if asin =~ /^(\d+X?)$/i
          list << ['ISBN' + $1.upcase.strip, blockurl, '']
        end
      }
    }
    return list.uniq
  end

  # Collects ISBNs from the isbn= parameter of bk1.co.jp search-result links.
  # Returns unique rows of [code, url, reviewer] with an empty reviewer.
  def decode_bk1(text, flag = nil)
    list = []
    each_block(text, flag) { |block, blockurl|
      block.scan(/href=\"http:\/\/www.bk1.co.jp\/cgi-bin\/srch\/srch_result_book.cgi(.+?)\"/mi) { |str|
        if str.first =~ /isbn=([^&]+)/mi
          list << ['ISBN' + $1.gsub(/\s/, '').upcase.strip, blockurl, '']
        end
      }
    }
    return list.uniq
  end

  # Collects ISBNs written in plain text ("ISBN" followed by the number).
  # Returns unique rows of [code, url, reviewer] with an empty reviewer.
  #
  # NOTE(review): the recovered pattern read (ISBN|ISBN)[:|:| |\s], i.e. with
  # duplicated alternatives.  Presumably the second of each pair was a
  # full-width variant that got normalised to ASCII; restore it from an
  # original copy if full-width text must match.  The pattern below accepts
  # exactly what the recovered one accepts.
  def decode_none(text, flag = nil)
    list = []
    each_block(text, flag) { |block, blockurl|
      block.scan(/(ISBN)[:|\s]*?(\d[\d-]+X?)/mi) { |str|
        list << ['ISBN' + str[1].upcase.strip, blockurl, '']
      }
    }
    return list.uniq
  end

  # Collects bk1 bibliographic ids from the bibid= parameter of bk1.co.jp
  # detail-page links.
  # Returns unique rows of [code, url, reviewer] with an empty reviewer.
  def decode_bk1_bibid(text, flag = nil)
    list = []
    each_block(text, flag) { |block, blockurl|
      block.scan(/href=\"http:\/\/www.bk1.co.jp\/cgi-bin\/srch\/srch_detail.cgi(.+?)\"/mi) { |str|
        if str.first =~ /bibid=([^&]+)/mi
          list << ['bibid' + $1.strip, blockurl, '']
        end
      }
    }
    return list.uniq
  end

  #========== display / decoration ========== ========== ==========

  # Builds the markup of a "register" button for one reference and returns it
  # squeezed onto a single line.
  #
  # NOTE(review): the heredoc body (the form markup) was stripped from the
  # recovered source, so this currently returns an empty string and none of
  # the parameters are used.  Restore the markup from an original copy.
  def button_regist(code, url, reviewer = '', urlmemo = '', point = '', memo = '', target = 'blank', script_name = '')
    com = <<-END
    END
    com = com.gsub(/\n/, '').gsub(/\s+/, ' ').strip
    # A commented-out one-line variant stood here; only its label, which
    # means "register", survived the stripping.
    # Does this have to be a POST, or else the '#' in the URL trips it up?
    return com
  end

  # Returns the display text of an Amazon link for `code`.
  #
  # NOTE(review): the surrounding anchor markup was stripped from the
  # recovered source; only the interpolated code remains, so `asid` and
  # `target` are currently unused.
  def link_amazon(code, asid = 'ipposjunkbox-22', target = 'blank')
    code = "#{code}"
    return code
  end

  # Returns the display text of a link to `url`.
  #
  # NOTE(review): the surrounding anchor markup was stripped from the
  # recovered source; only the interpolated URL remains, so `target` is
  # currently unused.
  def link_url(url, target = 'blank')
    url = "#{url}"
    return url
  end
end

=begin
rs_sample = ReviewSearch.new()
#list = rs_sample.decode_html('http://ippo.s5.xrea.com/diary/s_diary.html', 'amazon', 'r-ippo')
#list = rs_sample.decode_html('http://ippo.s5.xrea.com/diary/s_diary.html', nil, 'r-ippo', nil, 'local')
#list = rs_sample.decode_html('http://ippo.s5.xrea.com/diary/s_d200302.html', 'amazon', 'r-ippo', 'name', 'local')
#list = rs_sample.decode_html('http://alisato.cool.ne.jp/diary/200302a.html', nil, nil, 'name')
list = rs_sample.decode_html('http://alisato.cool.ne.jp/diary/200302a.html', 'none', nil, 'name')
p list
#p rs_sample.button_regist('ISBNtest', 'test.html', 'testman')
#p rs_sample.link_amazon('ISBNtest')
#p rs_sample.link_url('test.html')
=end

=begin
# To Ota-san: something like this should probably work.
rs = ReviewSearch.new()
wa = WebAgent.new()
wa.uri = 'http://www14.cds.ne.jp/~not/antenna/ndiary.html'
wa.get()
abody = wa.body.kconv(2)
list = []
alllist = []
targets = ['ISBN4061821725','ISBN4086001977'] # "Kemuri ka Tsuchi ka Kuimono", "Tenki Seirou"
urllist = []
abody.scan(/href=\"(.+?)\"/){ |str|
  url = str.first
  urllist << url if url =~ /^http:\/\//
}
urllist.uniq.each{ |url|
  $stderr.puts url
  list0 = []
  begin
    wa.uri = url
    wa.get()
    body = wa.body.kconv(2)
    list1 = rs.decode_html(url, 'none'  , nil, nil, nil, body)
    list2 = rs.decode_html(url, 'amazon', nil, nil, nil, body)
    list3 = rs.decode_html(url, 'bk1'   , nil, nil, nil, body)
    list0 = list1 + list2 + list3
  rescue
  end
  list0.uniq.each{ |line|
    targets.each{ |target|
      if line[0].to_s.gsub(/-/,'') == target
        list << [line[0], line[1]]
        $stderr.puts "CHK; #{line[0]} #{line[1]}"
      end
    }
  }
  alllist += list0.uniq
}
list.uniq.sort.each{ |line|
  puts line.join(',')
}
=end