-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathPdfTextPlugin.php
167 lines (155 loc) · 4.63 KB
/
PdfTextPlugin.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
<?php
/**
* PDF Text
*
* @copyright Copyright 2007-2012 Roy Rosenzweig Center for History and New Media
* @license http://www.gnu.org/licenses/gpl-3.0.txt GNU GPLv3
*/
/**
* The PDF Text plugin.
*
* @package Omeka\Plugins\PdfText
*/
class PdfTextPlugin extends Omeka_Plugin_AbstractPlugin
{
    /**
     * Name of the element set that holds the extracted text.
     */
    const ELEMENT_SET_NAME = 'PDF Text';

    /**
     * Name of the element (within the element set) that holds the text.
     */
    const ELEMENT_NAME = 'Text';

    /**
     * Hooks handled by this plugin; each maps to a hook* method below.
     *
     * @var array
     */
    protected $_hooks = array(
        'install',
        'uninstall',
        'initialize',
        'config_form',
        'config',
        'before_save_file',
    );

    /**
     * MIME types that are treated as PDF.
     *
     * 'applications/vnd.pdf' (sic) is a long-standing misspelling that is
     * kept so files already recorded with it are still recognized; the
     * correctly spelled 'application/vnd.pdf' is accepted as well.
     *
     * @var array
     */
    protected $_pdfMimeTypes = array(
        'application/pdf',
        'application/x-pdf',
        'application/acrobat',
        'text/x-pdf',
        'text/pdf',
        'application/vnd.pdf',
        'applications/vnd.pdf',
    );

    /**
     * Install the plugin.
     *
     * Creates the "PDF Text" element set with its single "Text" element.
     *
     * @throws Omeka_Plugin_Installer_Exception If pdftotext is not available
     *     or an element set with the same name already exists.
     */
    public function hookInstall()
    {
        // Don't install if the pdftotext command doesn't exist. The shell
        // snippet prints "1" only when `hash` cannot resolve the command.
        // See: http://stackoverflow.com/questions/592620/check-if-a-program-exists-from-a-bash-script
        if ((int) shell_exec('hash pdftotext 2>&- || echo 1')) {
            throw new Omeka_Plugin_Installer_Exception(__('The pdftotext command-line utility '
                . 'is not installed. pdftotext must be installed to install this plugin.'));
        }

        // Don't install if a PDF element set already exists.
        if ($this->_db->getTable('ElementSet')->findByName(self::ELEMENT_SET_NAME)) {
            throw new Omeka_Plugin_Installer_Exception(__('An element set by the name "%s" already '
                . 'exists. You must delete that element set to install this plugin.', self::ELEMENT_SET_NAME));
        }

        insert_element_set(
            array('name' => self::ELEMENT_SET_NAME, 'record_type' => 'File'),
            array(array('name' => self::ELEMENT_NAME))
        );
    }

    /**
     * Uninstall the plugin.
     *
     * Deletes the "PDF Text" element set if it exists.
     */
    public function hookUninstall()
    {
        $elementSet = $this->_db->getTable('ElementSet')->findByName(self::ELEMENT_SET_NAME);
        if ($elementSet) {
            $elementSet->delete();
        }
    }

    /**
     * Initialize this plugin.
     *
     * Registers the plugin's "languages" directory as a translation source.
     */
    public function hookInitialize()
    {
        add_translation_source(dirname(__FILE__) . '/languages');
    }

    /**
     * Display the config form.
     *
     * The form partial is told whether the current storage adapter is
     * supported by this plugin.
     */
    public function hookConfigForm()
    {
        echo get_view()->partial(
            'plugins/pdf-text-config-form.php',
            array('valid_storage_adapter' => $this->isValidStorageAdapter())
        );
    }

    /**
     * Handle the config form.
     *
     * Dispatches the PdfTextProcess long-running job when the form asked for
     * it and the storage adapter is supported.
     */
    public function hookConfig()
    {
        // The field may be absent from the POST entirely; empty() avoids an
        // undefined-index notice while keeping the same truthiness test.
        if (!empty($_POST['pdf_text_process']) && $this->isValidStorageAdapter()) {
            Zend_Registry::get('bootstrap')->getResource('jobs')
                ->sendLongRunning('PdfTextProcess');
        }
    }

    /**
     * Add the PDF text to the file record.
     *
     * This has a secondary effect of including the text in the search index.
     *
     * @param array $args Hook arguments; reads 'insert' (whether the record
     *     is being inserted) and 'record' (the file record being saved).
     */
    public function hookBeforeSaveFile($args)
    {
        // Extract text only on file insert.
        if (!$args['insert']) {
            return;
        }

        $file = $args['record'];

        // Ignore non-PDF files. Strict comparison avoids loose-equality
        // surprises when the MIME type is not a string.
        if (!in_array($file->mime_type, $this->_pdfMimeTypes, true)) {
            return;
        }

        // Add the PDF text to the file record.
        $element = $file->getElement(self::ELEMENT_SET_NAME, self::ELEMENT_NAME);
        $text = $this->pdfToText($file->getPath());

        // pdftotext must return a string to be saved to the element_texts
        // table; shell_exec() yields a non-string when there is no output.
        if (is_string($text)) {
            $file->addTextForElement($element, $text);
        }
    }

    /**
     * Extract the text from a PDF file.
     *
     * Runs `pdftotext -enc UTF-8 <path> -`, which writes the text to stdout.
     *
     * @param string $path Path to the PDF file; shell-escaped before use.
     * @return string|null The command output, or null when the command
     *     failed or produced no output.
     */
    public function pdfToText($path)
    {
        $path = escapeshellarg($path);
        return shell_exec("pdftotext -enc UTF-8 $path -");
    }

    /**
     * Determine if the plugin supports the storage adapter.
     *
     * pdftotext cannot be used on remote files, so only support the default
     * Filesystem adapter, which stores files locally.
     *
     * @return bool
     */
    public function isValidStorageAdapter()
    {
        $storageAdapter = Zend_Registry::get('bootstrap')
            ->getResource('storage')->getAdapter();

        return $storageAdapter instanceof Omeka_Storage_Adapter_Filesystem;
    }

    /**
     * Get the PDF MIME types.
     *
     * @return array
     */
    public function getPdfMimeTypes()
    {
        return $this->_pdfMimeTypes;
    }
}