-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_speeches.js
executable file
·109 lines (96 loc) · 3.73 KB
/
scrape_speeches.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
const request = require('request');
const cheerio = require('cheerio');
const fs = require('fs');
/*
 * Markers used to cut the transcript out of the scraped page text.
 * The transcript starts after startClip (or startClip2 when startClip is
 * absent) and ends before endClip (or endClip2 when endClip is absent).
 */
const startClip = "AUTHENTICITY CERTIFIED: Text version below transcribed directly from audio";
const startClip2 = "click for pdf";
const endClip = "Book/CDs by Michael E. Eidenmuller";
const endClip2 = "Copyright Status:";
/* Folder the speech text files are written to (trailing slash required) */
const dlFolder = "raw/";
/*
 * Create the folder on a first run; otherwise the listing below and every
 * later write into it would fail with ENOENT.
 */
if (!fs.existsSync(dlFolder)) {
  fs.mkdirSync(dlFolder);
}
/* Names of the files already present, used to skip finished downloads */
const dledFiles = fs.readdirSync(dlFolder);
/**
 * Normalise the text extracted from the page: Windows line breaks become
 * spaces, U+FFFD replacement characters become apostrophes, and runs of
 * whitespace collapse to a single space.
 *
 * @param {string} rawText - Text as returned by cheerio's `.text()`.
 * @returns {string} The cleaned text.
 */
function cleanPageText(rawText) {
  return rawText.replace(/\r\n/g, " ")
    .replace(/�/g, "'")
    .replace(/\s\s+/g, " ");
}

/**
 * Cut the transcript out of the cleaned page text.
 *
 * The transcript starts right after the first occurrence of `startClip`
 * (or of `startClip2` when `startClip` is absent) and stops right before the
 * first occurrence of `endClip` (or of `endClip2` when `endClip` is absent)
 * in what follows.
 *
 * @param {string} pageText - Cleaned page text.
 * @returns {{text: string, startMarker: (string|null), endMarker: (string|null)}}
 *   The clipped text plus the markers that were actually used; a marker is
 *   `null` when neither candidate was found.
 */
function clipSpeechText(pageText) {
  const clipped = { text: pageText, startMarker: null, endMarker: null };

  const startMarker = [startClip, startClip2].find(function(marker) {
    return pageText.indexOf(marker) >= 0;
  });
  if (startMarker === undefined) {
    return clipped;
  }
  clipped.startMarker = startMarker;
  /*
   * slice() keeps everything after the first marker; split(marker)[1] would
   * silently drop the rest of the speech if the marker showed up again.
   */
  clipped.text = pageText.slice(pageText.indexOf(startMarker) + startMarker.length);

  const tail = clipped.text;
  const endMarker = [endClip, endClip2].find(function(marker) {
    return tail.indexOf(marker) >= 0;
  });
  if (endMarker === undefined) {
    return clipped;
  }
  clipped.endMarker = endMarker;
  clipped.text = tail.slice(0, tail.indexOf(endMarker));
  return clipped;
}

/**
 * Download one speech page, clip the transcript and write it to
 * `dlFolder + speechFilename`. Failures are logged per speech and never
 * abort the rest of the batch.
 *
 * @param {{id: *, link: string}} speech - Entry read from the id/link JSON file.
 * @param {string} speechFilename - Target file name, e.g. "speech12.txt".
 */
function downloadSpeech(speech, speechFilename) {
  /**
   * Seems like the site doesn't want crawlers, we'll have to spoof some
   * headers for our needs
   */
  const options = {
    url: speech["link"],
    /* Needed so `request` decodes the compressed encodings advertised below */
    gzip: true,
    headers: {
      'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      /* Only list encodings `request` is able to decode */
      'Accept-Encoding': 'gzip, deflate',
      'Accept-Language': 'en-US,en;q=0.8',
      'Cache-Control': 'max-age=0',
      'Cookie': 'h2=o; he=llo',
      'Upgrade-Insecure-Requests': '1',
      'Host': 'www.americanrhetoric.com'
    }
  };

  /**
   * Performing the actual request to the web page. Any error is caught and
   * logged against this speech.
   */
  request(options, function(err, res, body) {
    try {
      if (err) throw err;
      if (res.statusCode !== 200) {
        console.log("Failed to download " + speechFilename + ". HTTP status: " + res.statusCode);
        return;
      }

      const $ = cheerio.load(body);
      const clipped = clipSpeechText(cleanPageText($("td[align=center]").text()));

      if (clipped.startMarker === null) {
        console.log("Failed to find the start of the transcript for " + speechFilename);
        console.log(res.request.path);
        return;
      }
      /* Check for this authenticity badge for easy clipping of text */
      console.log(clipped.startMarker === startClip ? "Got auth text" : "Got pdf text");

      if (clipped.endMarker === null) {
        console.log("Failed to restructure file");
        console.log(res.request.path);
        return;
      }
      /* "Book/CDs" text help to end the clipping */
      if (clipped.endMarker === endClip) {
        console.log('Got Book/CDs text');
      } else {
        console.log('Got copyright status text. You may need to check ' + speechFilename);
      }

      console.log("Writing job file...");
      fs.writeFile(dlFolder + speechFilename, clipped.text.trim(), function(writeErr) {
        /*
         * This callback runs outside the try block above, so a throw here
         * would be uncaught and kill every other pending download.
         */
        if (writeErr) {
          console.log("Failed to write " + speechFilename + ". Error: " + writeErr);
        }
      });
    } catch (error) {
      console.log("Failed to process speech " + speech['id'] + ". Error: " + error);
    }
  });
}

/*
 * Entry point: read the list of speeches and download every one that is not
 * already present in the download folder.
 */
fs.readFile("obama_speeches_id.json", function(err, file) {
  /* something's wrong with the file */
  if (err) throw err;
  const data = JSON.parse(file.toString());
  const alreadyDownloaded = new Set(dledFiles);
  for (let i = 0; i < data.length; i++) {
    /* Block-scoped so each request callback keeps its own speech */
    const speech = data[i];
    /* Don't download files that have already been downloaded */
    const speechFilename = "speech" + speech['id'] + ".txt";
    if (alreadyDownloaded.has(speechFilename)) {
      console.log(speechFilename + " already exists.");
      continue;
    }
    downloadSpeech(speech, speechFilename);
  }
});