-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathconfig.py
43 lines (39 loc) · 1.57 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
"""
Set your environment variables in your .env file e.g.:
DATA_DIR=/path/to/PDFtranscript/tests
DOCKER_INSTALL=1
"""
import os
import os.path
from dotenv import load_dotenv
# Load the .env file (see the module docstring) into the environment.
# This has to run before the os.getenv() lookups below so they can see
# the values defined there.
load_dotenv()
# --- Filesystem locations ---------------------------------------------------
# Base data directory (your project root). Point it at your own data with the
# DATA_DIR environment variable, e.g. /path/to/your/data/dir.
DATA_DIR = os.environ.get('DATA_DIR', 'tests/')

# Sub-folders of DATA_DIR:
#   PDF_DIR  - the PDFs to convert are read from here.
#   HTML_DIR - pdf2htmlEX writes its (non-semantic) HTML output here after
#              running `./pdf2html.py`.
PDF_DIR, HTML_DIR = (
    os.path.join(DATA_DIR, subfolder) for subfolder in ('PDF', 'HTML')
)

# Location of the full original fonts; ttf.py compares them with the broken
# ones.
FULL_FONTS_PATH = os.environ.get(
    'FULL_FONTS_PATH', '/path/to/truetype/fonts/'
)
# Boolean switch read from the DOCKER_INSTALL environment variable; unset
# means False. (Presumably it selects running pdf2htmlEX from the Docker
# image named below -- confirm against the code that consumes it.)
#
# Integer values keep their meaning: 0 is False, any other integer is True.
# The words true/yes/on/y/t (any case) are also accepted as True. Every other
# non-numeric value, including an empty `DOCKER_INSTALL=`, is treated as
# False instead of aborting the import of this module with a ValueError.
_docker_install_raw = os.getenv('DOCKER_INSTALL', '0').strip().lower()
try:
    DOCKER_INSTALL = bool(int(_docker_install_raw))
except ValueError:
    DOCKER_INSTALL = _docker_install_raw in ('true', 'yes', 'on', 'y', 't')

# Docker image reference (repository:tag); override it with the
# DOCKER_IMG_TAG environment variable.
DOCKER_IMG_TAG = os.getenv(
    'DOCKER_IMG_TAG',
    'pdf2htmlex/pdf2htmlex:0.18.8.rc2-master-20200820-ubuntu-20.04-x86_64',
)
# remove mumbo-jumbo TEXT strings before HTML processing (regexes or text)
# Each entry is a single pattern string. The commented-out lines are
# examples: uncomment them or add your own inside the parentheses.
# NOTE: end every entry with a comma. With exactly one entry and no trailing
# comma, `(...)` evaluates to a plain string instead of a 1-tuple.
REMOVE_BEFORE = (
    # r'The Office for Standards.*?www\.ofsted\.gov\.uk',
    # r'Any complaints.*?\@ofsted\.gov\.uk',
    # r'© Crown copyright 20\d\d',
    # r'Inspection grades:.*?inspection terms',
    # r'This letter.*?their school\.',
    # r'You can use Parent View.*?www\.ofsted\.gov\.uk',
)
# find and replace after HTML processing finished
# Each entry is a (pattern, replacement) pair, as in the commented-out
# example below. Keep the trailing comma after every pair so that the outer
# parentheses remain a tuple of pairs.
REPLACE_AFTER = (
    # (r'td>( )?Overall effectiveness j',
    # 'td colspan=4>Overall effectiveness j'),
)
# Additional bullet point characters to be expected at start of line for <li>
# Copied out of the processed PDF. Common bullets are pre-programmed.
# NOTE(review): both entries display as empty strings here. They may be
# non-printing (e.g. private-use) glyphs pasted from a PDF, or characters
# lost in copying -- verify the actual code points. If these are matched as
# line prefixes (TODO confirm against the caller), a genuinely empty string
# would match every line.
BULLETS = ('', '')