-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparseCore18.js
49 lines (41 loc) · 1.36 KB
/
parseCore18.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
//@ts-check
"use strict";
const striptags = require('striptags');
const readline = require('readline');
const fs = require('fs');

// Command-line arguments: the input file is read line by line, and every
// output file is created inside the output folder.
const fileToProcess = process.argv[2]; //file
const outFolder = process.argv[3]; //folder

// Fail early with a readable message instead of letting createReadStream
// throw on an undefined path or writing to a folder literally named "undefined".
if (!fileToProcess || !outFolder) {
  console.error("Usage: node parseCore18.js <inputFile> <outputFolder>");
  process.exit(1);
}

// Suffix of the output file currently being appended to ("file0", "file1", ...).
let outFileCounter = 0;
// Number of input lines handled so far; drives progress logging and file rotation.
let lineCounter = 0;

// Streams the input as UTF-8 and emits one 'line' event per input line.
const readInterface = readline.createInterface({
  input: fs.createReadStream(fileToProcess, { encoding: 'utf8' })
});
/**
 * Builds one `<DOC>` record (DOCNO followed by TEXT) from a parsed input document.
 *
 * The text is assembled from the entries of `doc.contents`:
 *  - entries with type 'title' or subtype 'paragraph' contribute `content`,
 *  - entries with type 'image' contribute `fullcaption`,
 * each passed through striptags and prefixed with a single space.
 * Null entries and entries without an own `type` property are ignored.
 *
 * @param {any} doc - Parsed JSON object; `id` and `contents` are read from it.
 * @returns {string} The serialized record, terminated by "</DOC>\n".
 */
function buildIndriDoc(doc) {
  const parts = ["<DOC>\n<DOCNO>" + doc.id + "</DOCNO>\n<TEXT>\n"];

  // A missing or malformed `contents` yields a record with empty text
  // instead of a TypeError on `.forEach`.
  const contents = Array.isArray(doc.contents) ? doc.contents : [];

  contents.forEach(function (c) {
    // Call hasOwnProperty through the prototype so that data containing its
    // own "hasOwnProperty" key cannot break the check.
    if (c == null || !Object.prototype.hasOwnProperty.call(c, "type")) {
      return;
    }
    if (c.type === 'title' || c.subtype === 'paragraph') {
      // Missing text contributes an empty string.
      parts.push(" " + striptags(c.content || ""));
    }
    if (c.type === 'image') {
      parts.push(" " + striptags(c.fullcaption || ""));
    }
  });

  parts.push("</TEXT>\n</DOC>\n");
  return parts.join("");
}

// Converts every input line (one JSON document per line) into a record and
// appends it to the current output file. After every 10000 processed lines a
// progress message is printed and subsequent records go to the next file.
readInterface.on('line', function (line) {
  // Blank lines are not JSON documents; skip them rather than crash in
  // JSON.parse. They are not counted, so each output file still receives
  // 10000 records.
  if (line.trim() === "") {
    return;
  }
  lineCounter++;

  const indriDoc = buildIndriDoc(JSON.parse(line));
  const outFile = outFolder + "/file" + outFileCounter;

  try {
    fs.appendFileSync(outFile, indriDoc);
  } catch (err) {
    // Report which file could not be written, then abort: continuing would
    // silently drop records.
    console.error("Failed to append to " + outFile, err);
    throw err;
  }

  if (lineCounter % 10000 === 0) {
    console.log("Line " + lineCounter + " written to file.");
    outFileCounter++;
  }
});