-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathttf.py
executable file
·205 lines (165 loc) · 6.1 KB
/
ttf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#!/usr/bin/env python3
"""
Copied text from a PDF and pasted random symbols?
PDF files are sometimes purposely protected from copying,
but this often unintentionally prevents distribution and
findability of knowledge.
Another reason to embed fonts is to protect the font designs
themselves from copying.
This program is trying to recover that text and lost knowledge.
It recovers text in HTML produced by pdf2htmlEX from PDFs
where characters are broken due to embedded fonts
with bad character maps (CMAPs).
This makes the text searchable and lets it be copied and pasted.
The program works by comparing glyph shapes of the embedded
fonts with known fonts, so it is very helpful if the fonts
used in the PDF document are known and their full version
is available. This allows fully automatic repair of information.
If the fonts are unknown, unavailable, or glyphs can't be recognized,
the program will ask the user to recognize the letter shape
and key in the right symbol.
It will only ask once for each shape and remember the letter choice
in a human-readable dictionary (dictionary.json).
The technical reason for random symbols:
Seemingly random characters are produced when you copy/paste text from PDF
because the PDF embedded fonts don't use standard unicode character code maps.
They use Private Use Area unicode range for mapping the glyph indices to codes.
"""
from lxml.html import tostring
import string
import glob
import json
try:
from freetype import Face, FT_LOAD_RENDER, FT_LOAD_TARGET_MONO
except ImportError:
print('Requires: pip3 install freetype-py')
try:
from pdftranscript.config import FULL_FONTS_PATH
except ImportError:
FULL_FONTS_PATH = './fonts'
# Truthy value turns on diagnostic output (load_fonts prints the fonts it loads).
DEBUG = 1
def pua_content(txt):
    """Ratio of characters encoded using Private Use Area (PUA) E000—F8FF.

    PUA is used by PDF embedded fonts if original CMAP was thrown away.

    Args:
        txt: String to inspect.

    Returns:
        Float in the range 0.0-1.0: PUA characters divided by total
        characters. Empty input yields 0.0 instead of raising
        ZeroDivisionError.
    """
    if not txt:
        # Nothing to measure; avoid dividing by a zero length.
        return 0.0
    pua_count = sum(1 for x in txt if 0xE000 <= ord(x) <= 0xF8FF)
    return pua_count / float(len(txt))
def bits(x):
    """Expand the low 8 bits of ``x`` into a list of 0/1 ints.

    The most significant of the eight bits comes first in the result.
    """
    return [(x >> shift) & 1 for shift in range(7, -1, -1)]
def show_glyph(data, bitmap, draw=True):
    """Build a text-art picture of a glyph, one string per pixel row.

    Every pixel becomes a two-character cell: '█ ' when set, two spaces
    when clear. Both cells must be the same width, otherwise rows cut at
    ``bitmap.width * 2`` characters no longer line up with pixel rows.

    Args:
        data: Flat sequence of truthy/falsy pixel values, row after row.
        bitmap: Object exposing ``width`` (pixels per row).
        draw: Unused; kept so existing callers keep working.

    Returns:
        List of strings, one per complete pixel row. Each row is
        ``bitmap.width * 2 - 1`` characters long: the final padding
        character of a row is dropped, as the previous character-by-character
        splitting did. Nothing is printed here; the caller prints the
        rows. Pixels that do not fill a complete row are ignored.
    """
    width = bitmap.width
    if width <= 0:
        return []
    cells = ['█ ' if px else '  ' for px in data]
    rows = []
    for start in range(0, len(cells) - width + 1, width):
        # Drop the trailing pad character of the row's last cell.
        rows.append(''.join(cells[start:start + width])[:-1])
    return rows
def glyph_data(face, char):
    """Render ``char`` with ``face`` and flatten the bitmap into a pixel list.

    The glyph is loaded with FT_LOAD_RENDER | FT_LOAD_TARGET_MONO. Each
    buffer byte is expanded with ``bits`` and every row is trimmed to
    ``bitmap.width`` entries, since a row occupies ``bitmap.pitch`` bytes.

    Returns:
        Tuple ``(data, bitmap)``: the flat list of 0/1 pixel values in row
        order, and the face's glyph bitmap object.
    """
    face.set_char_size(32 * 48)  # 24*32, 32*48, 48*64
    face.load_char(char, FT_LOAD_RENDER | FT_LOAD_TARGET_MONO)
    bitmap = face.glyph.bitmap
    pitch = bitmap.pitch
    width = bitmap.width
    buffer = bitmap.buffer

    pixels = []
    for row_index in range(bitmap.rows):
        offset = row_index * pitch
        row_bits = []
        for col in range(pitch):
            row_bits += bits(buffer[offset + col])
        # Bits past the glyph width are padding in the final byte(s) of the row.
        pixels += row_bits[:width]
    return pixels, bitmap
def load_fonts(path):
    """Load every ``*.ttf`` file directly inside ``path``.

    Args:
        path: Directory to search (not recursive).

    Returns:
        Dict mapping the font file name without directory and extension
        (e.g. ``'f1'`` for ``.../f1.ttf``) to a freetype ``Face``.
    """
    import os.path  # local import: the module only imports os.path in its script section

    # TODO: WOFF handling
    font_files = glob.glob(path + '/*.ttf')  # + glob.glob(path+'/*.woff')
    # basename/splitext handle the platform's path separator and only strip
    # the real extension, unlike split('/') + replace('.ttf', '').
    fonts = {
        os.path.splitext(os.path.basename(font_file))[0]: Face(font_file)
        for font_file in font_files
    }
    if DEBUG:
        print('Loading fonts from: ' + path)
        for face in fonts.values():
            print(face.family_name.decode(), face.style_name.decode(), face.num_glyphs, 'glyphs')
    return fonts
def char_lookup(fonts):
    """Map rendered glyph shapes of known fonts to their characters.

    For every font in ``fonts`` and every character of the supported set,
    the glyph is rendered with ``glyph_data`` and the string form of its
    pixel list becomes the key. When two glyphs render identically, the one
    processed later wins.

    Returns:
        Dict of ``str(pixel_list) -> character``.
    """
    chars = string.printable + "£©¹’'‘’“”"
    return {
        str(glyph_data(face, symbol)[0]): symbol
        for face in fonts.values()
        for symbol in chars
    }
def lookup_user(data, bitmap):
    """Resolve a glyph shape via the saved dictionary, asking the user if needed.

    The glyph is turned into text art with ``show_glyph`` and searched for
    in ``dictionary.json`` (a list of ``[character, shape]`` pairs). If it
    is not found, the shape is printed, the user is prompted for the
    character, and the answer is stored back into the dictionary.

    Args:
        data: Flat pixel list of the glyph.
        bitmap: Bitmap object passed through to ``show_glyph``.

    Returns:
        The character associated with the shape.
    """
    dictionary = 'dictionary.json'
    try:
        with open(dictionary, 'r', encoding='utf-8') as fh:
            lookup = json.load(fh)
    except (FileNotFoundError, ValueError):  # dictionary missing or empty
        lookup = []

    shape = show_glyph(data, bitmap)
    # lookup shape in our dictionary
    for known_char, known_shape in lookup:
        if known_shape == shape:
            return known_char

    # No known character - ask for input
    for line in shape:
        print(line)
    print('\a')
    char = input('Please enter character shown: ')
    print('you entered: ', char)
    lookup.append((char, shape))
    lookup = sorted(lookup, key=lambda x: x[0])
    # Explicit UTF-8: shapes contain non-ASCII block characters and are
    # written with ensure_ascii=False. The context manager guarantees the
    # file is flushed and closed before returning.
    with open(dictionary, 'w', encoding='utf-8') as fh:
        json.dump(lookup, fh, indent=1, ensure_ascii=False)
    return char
# Shape -> character table built once, at import time, from the fonts found
# in FULL_FONTS_PATH; decode_font consults it before falling back to the user.
LOOKUP_FONTS = char_lookup(load_fonts(FULL_FONTS_PATH))
def decode_font(code, font, embed_fonts):
    """Translate ``code`` into readable text by recognising glyph shapes.

    Each codepoint is rendered with the embedded font ``embed_fonts[font]``.
    The rendered shape is looked up in LOOKUP_FONTS; shapes not present
    there are resolved through ``lookup_user``.

    Returns:
        The decoded string (empty when ``code`` is empty).
    """
    def resolve(codepoint):
        data, bitmap = glyph_data(embed_fonts[font], codepoint)
        key = str(data)
        if key in LOOKUP_FONTS:
            return LOOKUP_FONTS[key]
        return lookup_user(data, bitmap)

    return ''.join(resolve(codepoint) for codepoint in code)
def font_family(e):
    """Return the font-family id for element ``e`` (e.g. ``'f1'``, ``'f12'``).

    The element's own ``class`` attribute is checked first, then each
    ancestor's, for a class token of the form ``ff<id>``. The returned
    value is that token without its first ``f``. The whole id is kept, so
    ``ff12`` yields ``'f12'`` rather than being cut to ``'f1'``.

    Args:
        e: Element providing ``get`` and ``iterancestors``, or None.

    Returns:
        The font-family id, or ``'f1'`` when ``e`` is None or no such
        class token is found on the element or its ancestors.
    """
    def class_font(el):
        # `or ''` covers nodes whose get() yields None despite the default.
        css = el.get('class', '') or ''
        for token in css.split():
            if token.startswith('ff') and len(token) > 2:
                return token[1:]
        return None

    candidates = [] if e is None else [e, *e.iterancestors()]
    for node in candidates:
        family = class_font(node)
        if family:
            return family
    return 'f1'
def recover_text(dom, embed_fonts_path):
    """Decode, in place, the text and tail of every element in ``dom``.

    Fonts are loaded from ``embed_fonts_path``. An element's text is decoded
    with the element's own font family, its tail with the parent's font
    family, and only when that family is one of the loaded fonts.
    """
    embed_fonts = load_fonts(embed_fonts_path)

    def decode(txt, font):
        return decode_font(txt, font, embed_fonts)

    for node in dom.iter():
        own_family = font_family(node)
        parent_family = font_family(node.getparent())

        # element text and tail (text following the element) can use
        # different font families; decode only when the family is embedded
        text = node.text
        if text and text != ' ' and own_family in embed_fonts:
            node.text = decode(text, own_family)

        tail = node.tail
        if tail and parent_family in embed_fonts:
            node.tail = decode(tail, parent_family)
if __name__ == '__main__':
    # Script entry: repair one pdf2htmlEX document and save it as ".htm".
    from pdftranscript import transcript, config
    import os.path

    doc_path = config.HTML_DIR + '/100026_945655/100026_945655.html'
    dom, _css = transcript.prepare(doc_path)
    # Fonts are loaded from the directory that holds the HTML file.
    recover_text(dom, os.path.dirname(doc_path))

    # Swap only the file extension; str.replace would also rewrite any
    # '.html' occurring earlier in the path.
    out_path = os.path.splitext(doc_path)[0] + '.htm'
    # The context manager closes the file even if serialisation fails.
    with open(out_path, 'wb') as f:
        f.write(tostring(dom))