Sélectionner une révision Git
get_kaamelott.rb 1,85 Kio
#!/usr/bin/env ruby
# Usage: ./get_kaamelott.rb > Data/kaamelott_citations.dat
require 'nokogiri'
require 'open-uri'
# Root URL of the French Wikiquote site; page paths and scraped hrefs are
# appended to it to build the URLs that get fetched.
$base = 'https://fr.wikiquote.org'
# Flattens scraped text so it can be embedded in a double-quoted string.
#
# Newlines become spaces, typographic apostrophes (’) become plain ASCII
# apostrophes, and backslashes and double quotes are backslash-escaped.
# Backslashes must be escaped as well as quotes: otherwise an input such as
# \" would come out as \\" and terminate the surrounding string early.
#
# @param content [String] raw text taken from a page node
# @return [String] a new, escaped, single-line string
def convert(content)
  flattened = content.tr("\n’", " '")
  # Block form: the replacement is used literally, so the backslash in it is
  # not re-interpreted as a back-reference escape.
  flattened.gsub(/[\\"]/) { |char| "\\#{char}" }
end
# Fetches the page linked from +node+ and prints the quotes found on it to
# standard output as JSON-like text.
#
# Every element under the "mw-content-text" div whose class is "citation" or
# "ref" is visited in document order:
# * a <span> starts a new quote object keyed by its escaped text (closing the
#   previous one first, if any);
# * a <div> prints the fields parsed from its text: "acteur", "livre",
#   "episode numero" and "episode titre". Fields are printed empty when the
#   text does not match the expected pattern.
#
# The last quote object opened here is not closed before returning; the
# caller is expected to print the closing brace.
#
# @param node [Nokogiri::XML::Node] element containing an <a> whose href is
#   appended to $base to build the URL to fetch
# @return [void]
# @raise [RuntimeError] if a matched element is neither a <span> nor a <div>
def puts_new_page(node)
  link = node.xpath('.//a').attr('href')
  # URI.open is required: since Ruby 3.0, Kernel#open no longer fetches URLs
  # even when open-uri is loaded.
  page = Nokogiri::HTML(URI.open($base + link))
  first_quote = true
  page.xpath('//div[@id="mw-content-text"]//*[@class = "citation" or @class = "ref"]').each do |element|
    case element.name
    when 'span'
      puts ' },' unless first_quote
      puts " \"#{convert element.content}\": {"
      first_quote = false
    when 'div'
      source = element.content.match(/^([^,]+),\s+\w+,\s+Livre\s+([IVX]+),[^\d\w]*(?:(\d+)[^:]*:\s+)?"?([^,"]+)"?,/)
      # No match (or an absent optional group) yields nil, printed as "".
      acteur, livre, numero, titre = source ? source.captures : []
      puts " \"acteur\": \"#{acteur}\","
      puts " \"livre\": \"#{livre}\","
      puts " \"episode numero\": \"#{numero}\","
      puts " \"episode titre\": \"#{titre}\""
    else
      raise "Shit! Got this: #{element}"
    end
  end
end
# Main script: walks the Kaamelott page and prints, as JSON-like text on
# standard output, one object per character link ("extiw" <a>), each holding
# one object per quote. Quotes come either from "citation" spans and "ref"
# divs found directly on the page, or from the page linked inside a <dl>
# (handled by puts_new_page).
first_character = true
first_quote = false
puts '{'
# URI.open is required: since Ruby 3.0, Kernel#open no longer fetches URLs
# even when open-uri is loaded.
doc = Nokogiri::HTML(URI.open($base + '/wiki/Kaamelott'))
doc.xpath('//div[@id="mw-content-text"]//*[@class = "extiw" or @class = "citation" or @class = "ref" or self::dl]').each do |node|
  case node.name
  when 'a'
    # A new character starts: close the previous character's last quote,
    # then the previous character itself.
    puts ' }' unless first_character
    puts " }," unless first_character
    puts " \"#{convert node.content}\": {"
    first_character = false
    first_quote = true
  when 'dl'
    puts_new_page node
    # Stop once the linked page for 'Yvain' has been processed.
    break if node.xpath('.//a')[0].content == 'Yvain'
  when 'span'
    puts ' },' unless first_quote
    puts " \"#{convert node.content}\": {"
    first_quote = false
  when 'div'
    source = node.content.match(/^([^,]+),\s+\w+,\s+Livre\s+([IVX]+),\s*(?:ép.\s*)?(\d+)[^:]*:\s*([^,]+),/)
    # No match yields nil fields, printed as "".
    acteur, livre, numero, titre = source ? source.captures : []
    puts " \"acteur\": \"#{acteur}\","
    puts " \"livre\": \"#{livre}\","
    puts " \"episode numero\": \"#{numero}\","
    puts " \"episode titre\": \"#{titre}\""
  else
    raise "Shit! Got this: #{node}"
  end
end
# Close the last quote, then the last character.
# NOTE(review): the root '{' printed above is never closed, so the output has
# unbalanced braces — confirm whether the consumer of the .dat file expects a
# final '}' before adding one.
puts ' }'
puts ' }'