-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsex_vampires.py
150 lines (114 loc) · 4.39 KB
/
sex_vampires.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
"""
Better tipue search
"""
import os.path
import re
import json
from bs4 import BeautifulSoup
from codecs import open
try:
from urlparse import urljoin
except ImportError:
from urllib.parse import urljoin
from pelican import signals
from pelican.generators import CachingGenerator
def unTypography(string):
    """Normalize typographic characters in *string* to plain ASCII forms.

    Collapses newline runs, folds curly quotes into straight quotes, and
    maps a few special characters onto simple replacements so the search
    index contains clean, matchable text.
    """
    # Collapse any run of newlines / carriage returns into one newline.
    text = re.sub(r"[\n\r]+", "\n", string)
    # Literal replacements. Order matters: the quote-plus-space form must
    # be handled before the generic curly-quote substitution below.
    for old, new in (("^", '^'), (' ', ' '), ("“ ", '"')):
        text = text.replace(old, new)
    # Fold curly double/single quotes into their straight ASCII forms.
    text = re.sub(r"[“”]", '"', text)
    text = re.sub(r"[‘’]", "'", text)
    # Single-character substitutions: pilcrow becomes a plain space.
    text = text.replace("¶", " ")
    return text
class TipuesearchContentGenerator(CachingGenerator):
    """Pelican generator that writes ``tipuesearch_content.js``.

    Collects every published page, article (and article translations),
    raw pages listed in ``RAW_PAGES_TO_INDEX``, and template pages, and
    dumps them as a ``var tipuesearch = {...};`` JavaScript variable that
    the Tipue Search frontend consumes.
    """

    def __init__(self, context, settings, path, theme, output_path, *null):
        self.output_path = output_path
        self.context = context
        self.siteurl = settings.get('SITEURL')
        self.relative_urls = settings.get('RELATIVE_URLS')
        # TEMPLATE_PAGES may be absent from settings; fall back to an
        # empty mapping so iteration in generate_output() never crashes.
        self.tpages = settings.get('TEMPLATE_PAGES') or {}
        self.json_nodes = []

    def generate_output(self, writer):
        # The primary function that gets called.
        # Our output is the tipuesearch content body.
        path = os.path.join(self.output_path, 'tipuesearch_content.js')
        # Gather all the content we can: pages, articles, and translations.
        pages = self.context['pages'] + self.context['articles']
        for article in self.context['articles']:
            pages += article.translations
        # Process raw pages
        for srclink in self.context.get('RAW_PAGES_TO_INDEX', []):
            self.json_nodes.append(self.nodeFromRawPage(srclink))
        # Process template pages
        for srclink in self.tpages:
            self.json_nodes.append(self.nodeFromRawPage(srclink))
        # Process non-template pages. nodeFromPage() returns None for
        # unpublished documents; skip those so no null entries end up in
        # the JSON index.
        for page in pages:
            node = self.nodeFromPage(page)
            if node is not None:
                self.json_nodes.append(node)
        # Serialize the collected nodes into the search-index variable.
        data = json.dumps({'pages': self.json_nodes}, separators=(',', ':'), ensure_ascii=False, indent=1)
        root_node_js = f'var tipuesearch = {data};'
        with open(path, 'w', encoding='utf-8') as fd:
            fd.write(root_node_js)

    def nodeFromPage(self, page):
        """Build a search node dict from a page/article, or None.

        Returns None for drafts and other non-published documents so the
        caller can skip them.
        """
        if getattr(page, 'status', 'published') != 'published':
            return
        # Strip markup from title and body; unTypography normalizes the
        # typographic characters Pelican/Typogrify may have inserted.
        soup_title = BeautifulSoup(page.title, 'html.parser')
        page_title = unTypography(soup_title.get_text(' ', strip=True))
        soup_text = BeautifulSoup(page._content, 'html.parser')
        page_text = unTypography(soup_text.get_text(' ', strip=True))
        page_category = page.category.name if getattr(page, 'category', 'None') != 'None' else ''
        page_url = '.'
        if page.url:
            page_url = page.url if self.relative_urls else (self.siteurl + '/' + page.url)
        node = {
            'title': page_title,
            'text': page_text,
            'tags': page_category,
            'url': page_url,
            'loc': page_url
        }
        return node

    def nodeFromRawPage(self, srclink):
        """Build a search node dict from a raw/template HTML file on disk."""
        # Use a context manager so the file handle is always closed
        # (the original leaked it).
        with open(os.path.join(self.output_path, srclink), encoding='utf-8') as srcfile:
            soup = BeautifulSoup(srcfile, 'html.parser')
        # Remove every <script> element so its contents are not indexed.
        for script in soup.find_all("script"):
            script.extract()
        page_title = unTypography(soup.title.string) if soup.title is not None else ''
        page_text = unTypography(soup.get_text())
        # Raw pages have no Pelican category; use a fixed default.
        page_category = 'page'
        page_url = urljoin(self.siteurl, srclink)
        node = {
            'title': page_title,
            'text': page_text,
            'tags': page_category,
            'url': page_url
        }
        return node
def get_generators(generators):
    """Signal handler: hand Pelican our search-content generator class."""
    return TipuesearchContentGenerator
def register():
    """Plugin entry point: hook our generator into Pelican's signals."""
    signals.get_generators.connect(get_generators)