-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathparse-speeches.rb
executable file
·145 lines (124 loc) · 4.09 KB
/
parse-speeches.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env ruby
# frozen_string_literal: true
$LOAD_PATH.unshift "#{File.dirname(__FILE__)}/lib"
require "people"
require "hansard_parser"
require "configuration"
require "optparse"
require "ruby-progressbar"
# Convert a command-line date argument into a Date.
#
# text - one of the keywords "today", "yesterday" or "previous-working-day",
#        or any string that Date.parse understands.
#
# Returns a Date. Whatever Date.parse raises for unparseable text propagates.
def parse_date(text)
  reference = Date.today
  return reference if text == "today"
  return reference - 1 if text == "yesterday"
  return Date.parse(text) unless text == "previous-working-day"

  # On Sunday (wday 0) and Monday (wday 1) the previous working day is the
  # Friday before; on every other day it is simply the day before.
  days_back = { 0 => 2, 1 => 3 }.fetch(reference.wday, 1)
  reference - days_back
end
# Defaults
options = { load_database: true, proof: false, force: false, interactive: false }

# Command-line switches. Each block stores the value OptionParser yields for
# its switch (false for the "--no-load" form, true for the plain switches).
OptionParser.new do |opts|
  # The date keywords listed here are the ones parse_date recognises.
  opts.banner = <<~USAGE
    Usage: parse-speeches.rb [options] <from-date> [<to-date>]
        formatting of date:
          year.month.day or today or yesterday or previous-working-day
  USAGE
  opts.on("--no-load", "Just generate XML and don't load up database") do |value|
    options[:load_database] = value
  end
  opts.on("--interactive", "Upon error, allow the user to patch interactively") do |value|
    options[:interactive] = value
  end
  opts.on("--proof",
          "Only parse dates that are at proof stage. Will redownload and populate html cache for those dates.") do |value|
    options[:proof] = value
  end
  opts.on("--force", "On loading data into database delete records that are not in the XML") do |value|
    options[:force] = value
  end
end.parse!

# After option parsing ARGV holds only the positional date arguments.
unless [1, 2].include?(ARGV.size)
  puts "Need to supply one or two dates"
  # NOTE(review): this exits with status 0 even though it is a usage error;
  # kept as-is in case callers rely on it - consider `exit 1`.
  exit
end

# A single date means a one-day range.
from_date = parse_date(ARGV[0])
to_date = ARGV.size == 1 ? from_date : parse_date(ARGV[1])

conf = Configuration.new

# Make sure every XML output directory exists for both houses.
%w[origxml rewritexml scrapedxml].each do |stage|
  %w[representatives_debates senate_debates].each do |debates|
    FileUtils.mkdir_p "#{conf.xml_path}/#{stage}/#{debates}"
  end
end

# First load people back in so that we can look up member id's
people = PeopleCSVReader.read_members
parser = HansardParser.new(people)

# Two steps per day in the (inclusive) range: one for each house.
progress = ProgressBar.create(title: "parse-speeches", total: ((to_date - from_date + 1) * 2).to_i, format: "%t %e: |%B|")
# Run one parse and, in interactive mode, let the user recover from a failure.
#
# interactive - when falsy, a StandardError from the parse is re-raised right
#               after the "ERROR" line is printed.
# parse       - callable invoked as parse.call(date, path, house).
# date        - passed to +parse+; also interpolated into the error line and
#               the create_patch.rb command.
# path        - passed to +parse+ unchanged.
# house       - passed to +parse+; also interpolated into the error line and
#               the create_patch.rb command.
#
# In interactive mode the error message and backtrace are printed and the user
# is prompted until a recognised answer is given:
#   R - run the parse again
#   P - run create_patch.rb for this house/date, then run the parse again
#   C - give up on this parse and carry on
#   Q - re-raise the original error
# If stdin reaches end of input there is nobody to answer, so that is treated
# like Q rather than failing with a NoMethodError on nil.
#
# Returns the value of parse.call when it succeeds first time, nil when the
# failure was handled interactively. Raises the original error on Q / end of
# input, or when not interactive.
def parse_with_retry(interactive, parse, date, path, house)
  parse.call date, path, house
rescue StandardError => e
  puts "ERROR While processing #{house} #{date}:"
  raise unless interactive

  puts e.message
  puts e.backtrace.join("\n\t")
  loop do
    print "Retry / Patch / Continue / Quit? "
    answer = $stdin.gets
    # gets returns nil at end of input (closed or exhausted stdin).
    raise if answer.nil?

    # Only the first letter matters; surrounding whitespace is ignored.
    case answer.strip.upcase[0]
    when "P"
      system "#{File.dirname(__FILE__)}/create_patch.rb #{house} #{date}"
      parse_with_retry interactive, parse, date, path, house
      break
    when "R"
      parse_with_retry interactive, parse, date, path, house
      break
    when "C"
      break
    when "Q"
      raise
    end
  end
end
# Choose the parser entry point once, before the loop: with --proof the
# parser's proof-only variant is called, otherwise the regular one.
parse = if options[:proof]
          ->(day, path, house) { parser.parse_date_house_only_in_proof day, path, house }
        else
          ->(day, path, house) { parser.parse_date_house day, path, house }
        end

# Kind of helpful to start at the end date and go backwards when using the "--proof" option.
# So, always going to do this now.
to_date.downto(from_date) do |date|
  # Each day is parsed once per house, and the progress bar is advanced after each.
  parse_with_retry options[:interactive], parse, date,
                   "#{conf.xml_path}/scrapedxml/representatives_debates/#{date}.xml", House.representatives
  progress.increment
  parse_with_retry options[:interactive], parse, date,
                   "#{conf.xml_path}/scrapedxml/senate_debates/#{date}.xml", House.senate
  progress.increment
end
progress.finish
# And load up the database (skipped when --no-load was given)
if options[:load_database]
  # Each flag carries its own leading space so they can simply be concatenated.
  flags = [" --from=#{from_date} --to=#{to_date}", " --debates", " --lordsdebates"]
  flags << " --force" if options[:force]
  # Starts with 'perl' to be friendly with Windows
  system("perl #{conf.web_root}/twfy/scripts/xml2db.pl #{flags.join}")
end