-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgitbook.js
192 lines (162 loc) · 6.58 KB
/
gitbook.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
const { chromium, devices } = require('playwright');
/**
 * Legacy selector set, for sites built with the old GitBook.
 * The selectors are grouped by the step of the crawl that uses them.
 */
const SITE_CONFIG = {
  // Step 1: collect all chapter links.
  chapterLinksElmSelector: '.summary li.chapter>a',

  // Step 2: fetch the HTML content of every chapter.
  // Optional, cosmetic only.
  bodySelector: '.book-body',
  // Important: the container that holds each chapter's content.
  bookContentSelector: '#book-search-results',

  // Step 3: tidy up the page by removing the header, sidebar menu, etc.
  headerSelector: '.book-header',
  navNextSelector: '.navigation-next',
  sideBarSelector: '.book-summary',
};
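/**
 * Alternative SITE_CONFIG example, used with:
 *   url: 'https://docs.facefusion.io/',
 *   bookName: 'FaceFusion.pdf',
 * To use it, replace the active SITE_CONFIG with the commented-out one below.
 */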
// const SITE_CONFIG = {
// chapterLinksElmSelector: 'body > div > div > aside > div > ul a',
// bookContentSelector: 'main',
// headerSelector: 'header',
// }
/**
 * Crawls a GitBook-style site with Playwright and prints every chapter into a
 * single PDF file.
 *
 * Pipeline (see `run`): open the book's main page, collect the chapter links,
 * fetch the content HTML of every chapter, strip the site chrome from the main
 * page, replace its content with a plain menu followed by all chapters, and
 * print the page to PDF. All selectors come from the module-level
 * `SITE_CONFIG`.
 */
class GitBookPDFSpider {
  /**
   * Stores the collaborators; performs no I/O. Use `GitBookPDFSpider.create()`
   * to get an instance backed by a freshly launched browser.
   *
   * @param {object} options
   * @param {object} options.browser - Playwright browser; closed by `run()`.
   * @param {object} options.page - Page used to load the book and print the PDF.
   * @param {object} options.pageConfig - Stored as-is; not read by this class.
   * @param {string} options.url - URL of the book's main page.
   * @param {string} options.bookName - Output path of the generated PDF.
   */
  constructor({browser, page, pageConfig, url, bookName}) {
    this._browser = browser;
    this._mainPage = page;
    this._pageConfig = pageConfig;
    this._bookUrl = url;
    this._bookName = bookName;
  }

  /**
   * Launches Chromium, opens a page and returns a ready-to-run spider.
   *
   * @param {object} options
   * @param {string} options.url - URL of the book's main page.
   * @param {string} options.bookName - Output path of the generated PDF.
   * @param {boolean} [options.isMobile=true] - Accepted for backward
   *   compatibility but currently has no effect: no device emulation
   *   (e.g. `devices['iPhone 14']`) is applied and `pageConfig` is empty.
   * @returns {Promise<GitBookPDFSpider>}
   */
  static async create({url, bookName, isMobile = true}) {
    const browser = await chromium.launch();
    const pageConfig = {};
    try {
      const page = await browser.newPage();
      return new GitBookPDFSpider({browser, page, pageConfig, url, bookName});
    } catch (err) {
      // Do not leave a browser process behind when the page cannot be opened.
      await browser.close();
      throw err;
    }
  }

  /**
   * Runs the whole pipeline and writes the PDF to `bookName`.
   * The browser is closed whether the pipeline succeeds or fails.
   *
   * @returns {Promise<void>}
   */
  async run() {
    try {
      await this._openMainPage();
      const chaptersMetaInfo = await this._getChaptersMetaInfo();
      console.log('chaptersMetaInfo', chaptersMetaInfo);
      const chaptersHTMLContent = await this._fetchAllChaptersHTMLContent(chaptersMetaInfo);
      const pdfMenuHTML = await this._generatePDFMenuHTML(chaptersMetaInfo);
      await this._beautifyMainPage(chaptersMetaInfo);
      await this._generateFullHTMLPage(chaptersHTMLContent, pdfMenuHTML);
      await this._mainPage.pdf({ path: this._bookName, format: 'A4' });
    } finally {
      await this._browser.close();
    }
  }

  /** Navigates the main page to the book URL. */
  _openMainPage = async () => {
    console.log('open url', this._bookUrl);
    await this._mainPage.goto(this._bookUrl);
    await this._mainPage.waitForLoadState('domcontentloaded');
  }

  /**
   * Builds the HTML of a plain table of contents, one `<index>. <title>` line
   * per chapter (the index is zero-based).
   *
   * @param {Array<{url: string, title: string, id: string}>} [chaptersMetaInfo=[]]
   * @returns {Promise<string>} Markup of the styled menu element.
   */
  _generatePDFMenuHTML = async (chaptersMetaInfo = []) => {
    return this._mainPage.evaluate((chapters = []) => {
      const pdfMenu = document.createElement('div');
      pdfMenu.style.fontSize = '16px';
      pdfMenu.style.padding = '2px 48px';
      pdfMenu.style.marginBottom = '600px';
      chapters.forEach(({title}, index) => {
        const chapterLink = document.createElement('a');
        chapterLink.textContent = `${index}. ${title}`;
        const chapterLinkContainer = document.createElement('div');
        chapterLinkContainer.style.margin = '4px 18px';
        chapterLinkContainer.appendChild(chapterLink);
        pdfMenu.appendChild(chapterLinkContainer);
      });
      // outerHTML (not innerHTML) so the styles set on the menu element itself
      // are kept in the returned markup.
      return pdfMenu.outerHTML;
    }, chaptersMetaInfo);
  }

  /**
   * Strips the site chrome (sidebar, header, "next" navigation) from the main
   * page so that only the book content is printed. Elements that are not
   * found are skipped.
   *
   * @param {Array<object>} [chaptersMetaInfo=[]] - Kept for signature
   *   compatibility; not used.
   * @returns {Promise<void>}
   */
  _beautifyMainPage = async (chaptersMetaInfo = []) => {
    console.log('beautify MainPage...')
    await this._mainPage.evaluate((siteConfig) => {
      // for gitbook
      const bodyElm = document.querySelector(siteConfig.bodySelector);
      if (bodyElm) {
        bodyElm.style.position = 'static';
      }
      // for rustbook
      document.documentElement.style.setProperty('--sidebar-width', '0');
      document.querySelector(siteConfig.sideBarSelector)?.remove();
      document.querySelector(siteConfig.headerSelector)?.remove();
      document.querySelector(siteConfig.navNextSelector)?.remove();
    }, SITE_CONFIG);
  }

  /**
   * Collects `{url, title, id}` for every chapter link on the main page.
   * Links without an `href` are skipped; an empty title becomes 'UnTitled'.
   *
   * @returns {Promise<Array<{url: string, title: string, id: string}>>}
   * @throws {Error} When the chapter link selector matches nothing.
   */
  _getChaptersMetaInfo = async () => {
    console.log('get Chapters MetaInfo');
    return this._mainPage.evaluate((siteConfig) => {
      const linkElms = document.querySelectorAll(siteConfig.chapterLinksElmSelector);
      // querySelectorAll never returns null, so "nothing found" must be
      // detected through the length of the result.
      if (linkElms.length === 0) {
        throw new Error('Can not find chapter links, try to modify SITE_CONFIG.chapterLinksElmSelector');
      }
      const res = [];
      linkElms.forEach((link, index) => {
        if (link.href) {
          res.push({
            url: link.href,
            title: link.textContent.trim() || 'UnTitled',
            id: `pdfchapter_${index}`
          });
        }
      });
      return res;
    }, SITE_CONFIG);
  }

  /**
   * Visits every chapter URL one after another in a single helper page and
   * collects the chapters' content HTML. Chapters whose content container is
   * not found are left out of the result.
   *
   * @param {Array<{url: string}>} [chaptersMetaInfo=[]]
   * @returns {Promise<string[]>} Chapter HTML strings, in visiting order.
   */
  _fetchAllChaptersHTMLContent = async (chaptersMetaInfo = []) => {
    const chaptersContents = [];
    if (!chaptersMetaInfo.length) {
      return chaptersContents;
    }
    const newPage = await this._browser.newPage();
    try {
      for (const {url} of chaptersMetaInfo) {
        const contentHTML = await this._openURLAndPickHTMLStr(newPage, url);
        if (contentHTML) {
          chaptersContents.push(contentHTML);
        }
      }
    } finally {
      await newPage.close();
    }
    return chaptersContents;
  }

  /**
   * Opens `url` in `page` and returns the outer HTML of the book content
   * container.
   *
   * @param {object} page - Playwright page to navigate.
   * @param {string} url - Chapter URL.
   * @returns {Promise<string|null>} The HTML, or null when the container is
   *   not present on the page.
   */
  _openURLAndPickHTMLStr = async (page, url) => {
    console.log('open sub page', url);
    await page.goto(url);
    await page.waitForLoadState('domcontentloaded');
    await page.evaluate((siteConfig) => {
      const bodyElm = document.querySelector(siteConfig.bodySelector);
      if (bodyElm) {
        bodyElm.style.position = 'relative';
      }
    }, SITE_CONFIG);
    // Read the markup directly in the page: no element handle is created, so
    // there is nothing to dispose afterwards.
    return page.evaluate(
      (selector) => document.querySelector(selector)?.outerHTML ?? null,
      SITE_CONFIG.bookContentSelector,
    );
  }

  /**
   * Replaces the main page's book content with the menu followed by every
   * chapter. Does nothing when there are no chapters.
   *
   * @param {string[]} [chaptersHTMLContent=[]] - Chapter HTML strings.
   * @param {string} [pdfMenuHTML=''] - Menu HTML placed before the chapters.
   * @returns {Promise<void>}
   * @throws {Error} When the book content container is missing on the main page.
   */
  _generateFullHTMLPage = async (chaptersHTMLContent = [], pdfMenuHTML = '') => {
    if (!chaptersHTMLContent.length) return;
    await this._mainPage.evaluate(({chaptersHTMLContent, pdfMenuHTML, bookContentSelector}) => {
      const contentElm = document.querySelector(bookContentSelector);
      if (!contentElm) {
        throw new Error(`Can not find book content container "${bookContentSelector}", try to modify SITE_CONFIG.bookContentSelector`);
      }
      contentElm.innerHTML = '';
      contentElm.appendChild(document.createElement('div')).innerHTML = pdfMenuHTML;
      chaptersHTMLContent.forEach((htmlStr) => {
        const container = document.createElement('div');
        container.innerHTML = htmlStr;
        // Large top margin to push each chapter away from the previous one.
        container.style.marginTop = '800px';
        container.style.paddingTop = '40px';
        contentElm.appendChild(container);
      });
    }, {chaptersHTMLContent, pdfMenuHTML, bookContentSelector: SITE_CONFIG.bookContentSelector});
  }
}
// Entry point: create the spider for the configured book and render it to PDF.
GitBookPDFSpider.create({
  url: 'https://braydie.gitbooks.io/how-to-be-a-programmer/content/en/',
  bookName: 'How to be a programer.pdf',
})
  // Return the promise so that a failure inside run() reaches the handler
  // below instead of becoming an unhandled rejection.
  .then((spider) => spider.run())
  .catch((err) => {
    console.error('Failed to generate PDF:', err);
    process.exitCode = 1;
  });