This repository has been archived by the owner on Jul 19, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPDFProcessor.py
125 lines (88 loc) · 3.22 KB
/
PDFProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import wand.image
import wand.display
import wand.color
import PIL.Image
import io
from typing import List, Union
import numpy as np
def convert_pdf(file: bytes, resolution=50)-> List[bytes]:
"""
Convert a PDF into lihttp://docs.wand-py.org/en/0.4.1/wand/display.htmlst of images in bytes format.
The function removes the alpha channel from the image and
replaces it with a white background.
"""
all_pages = wand.image.Image(blob=file, resolution=resolution)
pages = list()
for i, page in enumerate(all_pages.sequence):
with wand.image.Image(page) as img:
img.format = 'png'
img.background_color = wand.color.Color('white')
img.alpha_channel = 'remove'
pages.append(img.make_blob())
return pages
def unify_pages(pages: List[bytes])-> bytes:
parss = [page_to_array(pg) for pg in pages]
npars = [without_padding(pg) for pg in parss]
return array_to_page(np.concatenate(npars))
def page_to_array(page: bytes)-> np.ndarray:
"""
takes bytes image and converts it through PIL library to numpy array
"""
pimg = PIL.Image.open(io.BytesIO(page))
return np.asarray(pimg)
def array_to_page(array: np.ndarray)->bytes:
"""
does exact opposite of page_to_array
"""
img = PIL.Image.fromarray(np.uint8(array))
faux_file = io.BytesIO()
img.save(faux_file, 'png')
return faux_file.getvalue()
def without_padding(arr: np.ndarray)->np.ndarray:
"""
finds lines closest to sides, that aren't PURELY white and cuts all the whites
:param arr: array with white blocks on sides
:return : modified array
"""
cx = 0 # leftmost white line
cy = 0 # topmost white line
mx = arr.shape[0]-1 # rightmost white line
my = arr.shape[1]-1 # downmost white line
for x in range(mx):
if arr[x].mean() != 255:
cx = x-1
break
for y in range(my):
if arr[..., y].mean() != 255:
cy = y-1
break
for xx in range(mx, cx, -1):
if arr[xx].mean() != 255:
mx = xx+1
break
for yy in range(my, cy, -1):
if arr[..., yy].mean() != 255:
my = yy+1
break
block = divide_block(arr, 'h', [cx, mx])[1:2]
nar = np.uint8(np.concatenate(tuple(block)))
block = divide_block(nar, 'v', [cy, my])[1:2]
mar = np.uint8(np.concatenate(tuple(block)))
return mar
def divide_block(arr: np.ndarray, rot: str, lines:Union[list, tuple])-> List[np.ndarray]:
block = list()
# find all horizontal lines
block.append(arr[0:lines[0]]) # add first block
lines.append(arr.shape[1]) # add last block
for idx, itm in enumerate(lines[:-1]):
if rot == 'h':
block.append((arr[itm:lines[idx+1]]))
else:
block.append((arr[..., itm:lines[idx + 1]]))
return block
if __name__ == '__main__':
png = convert_pdf(open('./sups/17092018/f5f09ebb5307b773b770c7bd358b1e299ae0aa325400884ddd7fcf878fef6790.pdf', 'rb').read())[0]
ar = page_to_array(png)
ar.setflags(write=True)
print(ar.shape)
wand.display.display(wand.image.Image(blob=array_to_page(without_padding(ar))))