Showing 3 changed files with 323 additions and 0 deletions.
@@ -0,0 +1,155 @@
/// ToUnicode CMap parsing
use std::collections::BTreeMap;
use lopdf::{Dictionary, Document, Object};
use crate::text::CMap;

/// The mapping from a CID to one or more Unicode code points.
#[derive(Debug)]
pub struct ToUnicodeCMap {
    pub mappings: BTreeMap<u32, Vec<u32>>,
}

impl ToUnicodeCMap {
    /// Parses a ToUnicode CMap from the given input string.
    pub fn parse(input: &str) -> Result<ToUnicodeCMap, String> {
        let mut mappings = BTreeMap::new();
        let mut lines = input.lines().map(|l| l.trim()).filter(|l| !l.is_empty());
        while let Some(line) = lines.next() {
            // Operator lines usually carry a leading count, e.g. "2 beginbfchar",
            // so match with `contains` rather than `starts_with`.
            if line.contains("beginbfchar") {
                // Process each bfchar mapping line until "endbfchar".
                while let Some(l) = lines.next() {
                    if l.starts_with("endbfchar") {
                        break;
                    }
                    // Expect a line like: "<0041> <0041>"
                    let tokens: Vec<&str> = l.split_whitespace().collect();
                    if tokens.len() < 2 {
                        continue; // skip malformed lines
                    }
                    let cid = parse_hex_token(tokens[0])?;
                    let uni = parse_hex_token(tokens[1])?;
                    mappings.insert(cid, vec![uni]);
                }
            } else if line.contains("beginbfrange") {
                // Process each bfrange mapping line until "endbfrange".
                while let Some(l) = lines.next() {
                    if l.starts_with("endbfrange") {
                        break;
                    }
                    // There are two forms:
                    // form 1: <start> <end> <startUnicode>
                    // form 2: <start> <end> [ <unicode1> <unicode2> ... ]
                    let tokens: Vec<&str> = l.split_whitespace().collect();
                    if tokens.len() < 3 {
                        continue;
                    }
                    let start = parse_hex_token(tokens[0])?;
                    let end = parse_hex_token(tokens[1])?;
                    if tokens[2].starts_with('[') {
                        // form 2: collect the bracketed tokens, stripping the
                        // '[' / ']' delimiters, which may or may not be glued
                        // to a hex token.
                        let mut arr_tokens = Vec::new();
                        for token in tokens.iter().skip(2) {
                            let cleaned = token.trim_start_matches('[').trim_end_matches(']');
                            if !cleaned.is_empty() {
                                arr_tokens.push(cleaned);
                            }
                            if token.ends_with(']') {
                                break;
                            }
                        }
                        let expected = end - start + 1;
                        if arr_tokens.len() != expected as usize {
                            return Err(format!(
                                "bfrange array length mismatch: expected {} but got {}",
                                expected,
                                arr_tokens.len()
                            ));
                        }
                        for (i, token) in arr_tokens.iter().enumerate() {
                            let uni = parse_hex_token(token)?;
                            mappings.insert(start + i as u32, vec![uni]);
                        }
                    } else {
                        // form 1: a single starting Unicode value, incremented
                        // once per CID in the range.
                        let start_uni = parse_hex_token(tokens[2])?;
                        let mut cur = start_uni;
                        for cid in start..=end {
                            mappings.insert(cid, vec![cur]);
                            cur += 1;
                        }
                    }
                }
            }
            // (Other sections, e.g. codespacerange, are skipped for now.)
        }
        Ok(ToUnicodeCMap { mappings })
    }
}

/// Helper: parse a hex token of the form "<...>" and return the number.
fn parse_hex_token(token: &str) -> Result<u32, String> {
    let token = token.trim();
    if token.len() < 2 {
        return Err("Hex token too short".into());
    }
    if token.starts_with('<') && token.ends_with('>') {
        let inner = &token[1..token.len() - 1];
        u32::from_str_radix(inner, 16)
            .map_err(|e| format!("Failed to parse hex token {}: {}", token, e))
    } else {
        Err(format!("Expected token enclosed in <>: {}", token))
    }
}
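
// A minimal sketch of how `parse` and `parse_hex_token` behave on a small,
// hand-written CMap fragment (the input below is hypothetical, not from a real PDF):
#[cfg(test)]
mod parse_tests {
    use super::*;

    #[test]
    fn parses_bfchar_and_bfrange() {
        let input = "1 beginbfchar\n<0041> <0061>\nendbfchar\n\
                     1 beginbfrange\n<0042> <0044> <0062>\nendbfrange";
        let cmap = ToUnicodeCMap::parse(input).unwrap();
        assert_eq!(cmap.mappings[&0x41], vec![0x61]); // 'A' -> 'a'
        assert_eq!(cmap.mappings[&0x44], vec![0x64]); // range entries increment
        assert_eq!(parse_hex_token("<0041>").unwrap(), 0x41);
    }
}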

/// Implement the CMap trait on our ToUnicodeCMap.
impl CMap for ToUnicodeCMap {
    fn map_bytes(&self, bytes: &[u8]) -> String {
        // For simplicity, assume that the byte sequence represents CIDs in
        // big-endian order and that each CID is 2 bytes long. A trailing odd
        // byte and any unmapped CID are silently skipped.
        let mut result = String::new();
        let mut i = 0;
        while i + 1 < bytes.len() {
            let cid = u16::from_be_bytes([bytes[i], bytes[i + 1]]) as u32;
            if let Some(unis) = self.mappings.get(&cid) {
                for &u in unis {
                    if let Some(ch) = std::char::from_u32(u) {
                        result.push(ch);
                    }
                }
            }
            i += 2;
        }
        result
    }
}
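
// Sketch: driving `map_bytes` with a hand-built table (the CID values here
// are hypothetical):
#[cfg(test)]
mod map_bytes_tests {
    use super::*;

    #[test]
    fn maps_two_byte_cids() {
        let mut mappings = BTreeMap::new();
        mappings.insert(1u32, vec![0x48u32]); // CID 1 -> 'H'
        mappings.insert(2, vec![0x69]);       // CID 2 -> 'i'
        let cmap = ToUnicodeCMap { mappings };
        assert_eq!(cmap.map_bytes(&[0x00, 0x01, 0x00, 0x02]), "Hi");
    }
}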

/// Looks for a `ToUnicode` entry on the font dictionary, resolves it to a
/// stream and parses its content into a `ToUnicodeCMap`.
pub fn get_to_unicode_cmap_from_font(
    font_dict: &Dictionary,
    doc: &Document,
) -> Result<ToUnicodeCMap, String> {
    let to_unicode_obj = font_dict
        .get(b"ToUnicode")
        .map_err(|_| "No ToUnicode entry found".to_string())?;

    let stream = match to_unicode_obj {
        Object::Reference(r) => doc
            .get_object(*r)
            .and_then(|obj| obj.as_stream().map(|s| s.clone()))
            .map_err(|e| format!("Error getting ToUnicode stream: {}", e))?,
        Object::Stream(s) => s.clone(),
        _ => return Err("Unexpected type for ToUnicode entry".into()),
    };

    let content = stream
        .decompressed_content()
        .map_err(|e| format!("Decompress error: {}", e))?;

    let cmap_str = String::from_utf8(content)
        .map_err(|e| format!("UTF-8 conversion error: {}", e))?;

    ToUnicodeCMap::parse(&cmap_str)
}
@@ -1 +1,168 @@
/// PDF Text decoding / encoding and ToUnicode handling
use lopdf::{Object, StringFormat};

/// Represents a text segment (decoded as a UTF-8 String) or a spacing adjustment.
#[derive(Debug, Clone)]
pub enum TextItem {
    Text(String), // a segment of text
    Offset(i32),  // a spacing adjustment (in thousandths of an em)
}

/// A trait for mapping raw byte sequences to Unicode using a ToUnicode CMap.
/// (In a full implementation, this would use the actual mapping defined in the PDF.)
pub trait CMap {
    fn map_bytes(&self, bytes: &[u8]) -> String;
}

/// Decode a PDF string (literal or hexadecimal) into a Rust UTF-8 `String`.
/// If a ToUnicode CMap is provided, it is used to map the raw bytes; otherwise
/// we fall back to a lossy UTF-8 conversion (a full implementation would apply
/// the font's encoding, e.g. WinAnsiEncoding, at this point).
pub fn decode_pdf_string(obj: &Object, to_unicode: Option<&impl CMap>) -> String {
    if let Object::String(ref bytes, format) = obj {
        match format {
            StringFormat::Literal => {
                // Escape sequences (\(, \), octal codes, ...) should be handled
                // here; for simplicity we assume the bytes are already unescaped.
                if let Some(cmap) = to_unicode {
                    cmap.map_bytes(bytes)
                } else {
                    String::from_utf8_lossy(bytes).into_owned()
                }
            },
            StringFormat::Hexadecimal => {
                // For hex strings the bytes are the raw binary data.
                if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
                    // Starts with a BOM: assume UTF-16BE.
                    let utf16_iter = bytes[2..].chunks(2).filter_map(|pair| {
                        if pair.len() == 2 {
                            Some(u16::from_be_bytes([pair[0], pair[1]]))
                        } else {
                            None
                        }
                    });
                    String::from_utf16(&utf16_iter.collect::<Vec<_>>()).unwrap_or_default()
                } else {
                    // Without a BOM, use the ToUnicode mapping if available, or fall back.
                    if let Some(cmap) = to_unicode {
                        cmap.map_bytes(bytes)
                    } else {
                        String::from_utf8_lossy(bytes).into_owned()
                    }
                }
            },
        }
    } else {
        String::new()
    }
}
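
// Sketch: a UTF-16BE hex string with a BOM decodes without any CMap. `NoCMap`
// is a throwaway type defined here only to pin down the generic parameter:
#[cfg(test)]
mod decode_tests {
    use super::*;

    struct NoCMap;
    impl CMap for NoCMap {
        fn map_bytes(&self, bytes: &[u8]) -> String {
            String::from_utf8_lossy(bytes).into_owned()
        }
    }

    #[test]
    fn decodes_utf16be_hex_string() {
        let obj = Object::String(vec![0xFE, 0xFF, 0x00, 0x41], StringFormat::Hexadecimal);
        assert_eq!(decode_pdf_string(&obj, None::<&NoCMap>), "A");
    }
}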

/// Given the operands of a TJ operator (an array of PDF objects), decode them
/// into a `Vec<TextItem>`: string elements become `TextItem::Text` (after
/// decoding) and numbers become `TextItem::Offset`.
pub fn decode_tj_operands(operands: &[Object], to_unicode: Option<&impl CMap>) -> Vec<TextItem> {
    let mut items = Vec::new();
    for obj in operands {
        match obj {
            Object::String(_, _) => {
                let s = decode_pdf_string(obj, to_unicode);
                items.push(TextItem::Text(s));
            },
            Object::Integer(i) => {
                items.push(TextItem::Offset(*i as i32));
            },
            Object::Real(r) => {
                // Note: fractional offsets are truncated towards zero.
                items.push(TextItem::Offset(*r as i32));
            },
            _ => {
                // Ignore unsupported operand types.
            }
        }
    }
    items
}
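
// Sketch: a TJ array like [(Hel) -20 (lo)] arrives as the operands below.
// `NoCMap` is again a local stand-in for the generic parameter:
#[cfg(test)]
mod tj_tests {
    use super::*;

    struct NoCMap;
    impl CMap for NoCMap {
        fn map_bytes(&self, bytes: &[u8]) -> String {
            String::from_utf8_lossy(bytes).into_owned()
        }
    }

    #[test]
    fn splits_text_and_offsets() {
        let operands = vec![
            Object::String(b"Hel".to_vec(), StringFormat::Literal),
            Object::Integer(-20),
            Object::String(b"lo".to_vec(), StringFormat::Literal),
        ];
        let items = decode_tj_operands(&operands, None::<&NoCMap>);
        assert_eq!(items.len(), 3);
        assert!(matches!(&items[0], TextItem::Text(s) if s.as_str() == "Hel"));
        assert!(matches!(&items[1], TextItem::Offset(-20)));
    }
}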

/// Encode a Rust string as a PDF literal string: the text is wrapped in
/// parentheses and special characters are escaped.
pub fn encode_pdf_string_literal(s: &str) -> String {
    let mut result = String::with_capacity(s.len() + 2);
    result.push('(');
    for c in s.chars() {
        match c {
            '(' => result.push_str("\\("),
            ')' => result.push_str("\\)"),
            '\\' => result.push_str("\\\\"),
            '\n' => result.push_str("\\n"),
            '\r' => result.push_str("\\r"),
            '\t' => result.push_str("\\t"),
            '\x08' => result.push_str("\\b"),
            '\x0C' => result.push_str("\\f"),
            _ => result.push(c),
        }
    }
    result.push(')');
    result
}
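
// Sketch: delimiters and backslashes get escaped in the literal form:
#[cfg(test)]
mod literal_tests {
    use super::*;

    #[test]
    fn escapes_special_characters() {
        assert_eq!(encode_pdf_string_literal(r"a(b)\c"), r"(a\(b\)\\c)");
    }
}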

/// Encode a Rust string as a PDF hex string: the text is encoded as UTF-16BE
/// with a BOM (0xFEFF) and written out as hex digits.
pub fn encode_pdf_string_hex(s: &str) -> String {
    // Encode as UTF-16BE with BOM.
    let mut utf16: Vec<u16> = vec![0xFEFF]; // BOM
    utf16.extend(s.encode_utf16());
    let mut bytes = Vec::with_capacity(utf16.len() * 2);
    for code in utf16 {
        bytes.extend_from_slice(&code.to_be_bytes());
    }
    let hex: String = bytes.iter().map(|b| format!("{:02X}", b)).collect();
    format!("<{}>", hex)
}
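
// Sketch: "A" becomes the BOM (FEFF) followed by its UTF-16BE code unit:
#[cfg(test)]
mod hex_tests {
    use super::*;

    #[test]
    fn encodes_with_bom() {
        assert_eq!(encode_pdf_string_hex("A"), "<FEFF0041>");
    }
}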

/// Given a Rust string, decide whether the literal or the hex encoding yields
/// the smaller output, and return that PDF string representation.
pub fn encode_pdf_string_minimal(s: &str) -> String {
    let literal = encode_pdf_string_literal(s);
    let hex = encode_pdf_string_hex(s);
    if literal.len() <= hex.len() {
        literal
    } else {
        hex
    }
}
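
// Sketch: for plain ASCII the literal form is shorter, so it wins:
#[cfg(test)]
mod minimal_tests {
    use super::*;

    #[test]
    fn prefers_the_shorter_encoding() {
        assert_eq!(encode_pdf_string_minimal("A"), "(A)"); // 3 bytes vs. "<FEFF0041>"
    }
}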

/// Encodes a vector of `TextItem` into a vector of `lopdf::Object` suitable
/// for a TJ operator: text segments become PDF strings (choosing the minimal
/// encoding) and spacing offsets become numbers.
pub fn encode_text_items(items: &[TextItem]) -> Vec<Object> {
    let mut objs = Vec::new();
    for item in items {
        match item {
            TextItem::Text(s) => {
                let pdf_str = encode_pdf_string_minimal(s);
                // Check whether the minimal encoding is hex or literal by its delimiters.
                if pdf_str.starts_with('<') {
                    // For hex, strip the delimiters and convert the digits back to bytes.
                    let inner = &pdf_str[1..pdf_str.len() - 1];
                    let mut bytes = Vec::new();
                    for i in (0..inner.len()).step_by(2) {
                        if i + 2 <= inner.len() {
                            if let Ok(byte) = u8::from_str_radix(&inner[i..i + 2], 16) {
                                bytes.push(byte);
                            }
                        }
                    }
                    objs.push(Object::String(bytes, StringFormat::Hexadecimal));
                } else {
                    // For literal strings, store the raw (unescaped) UTF-8 bytes;
                    // `Object::String` holds unescaped data, and escaping is the
                    // writer's concern when the document is serialized.
                    objs.push(Object::String(s.as_bytes().to_vec(), StringFormat::Literal));
                }
            },
            TextItem::Offset(n) => {
                objs.push(Object::Integer(*n as i64));
            },
        }
    }
    objs
}