Skip to content

Commit

Permalink
Impl CMap parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
fschutt committed Feb 27, 2025
1 parent 597b5c1 commit 16ba6a6
Show file tree
Hide file tree
Showing 3 changed files with 323 additions and 0 deletions.
155 changes: 155 additions & 0 deletions src/cmap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
//! ToUnicode CMap parsing
use std::collections::BTreeMap;
use lopdf::{Dictionary, Document, Object};
use crate::text::CMap;

/// The mapping from a CID to one or more Unicode code points.
#[derive(Debug)]
pub struct ToUnicodeCMap {
    /// CID -> Unicode scalar values. A single CID may expand to several
    /// code points (e.g. ligatures such as "ff" mapping to "f","f").
    pub mappings: BTreeMap<u32, Vec<u32>>,
}

impl ToUnicodeCMap {
    /// Parses a ToUnicode CMap from the given input string.
    ///
    /// Handles `bfchar` and `bfrange` sections. Real-world CMaps prefix the
    /// section keywords with a count (e.g. `"2 beginbfchar"`), so keywords
    /// are matched anywhere in the line rather than only at line start.
    /// Destination tokens longer than four hex digits are decoded as a
    /// sequence of UTF-16BE code units, so one CID can map to several
    /// code points (and surrogate pairs are combined correctly).
    ///
    /// Other sections (e.g. `codespacerange`) are intentionally skipped.
    ///
    /// # Errors
    /// Returns a descriptive `String` on malformed hex tokens, on a
    /// `bfrange` whose end is below its start, or on a form-2 `bfrange`
    /// array whose length does not match the CID range.
    pub fn parse(input: &str) -> Result<ToUnicodeCMap, String> {
        /// Strip the `<...>` delimiters from a hex token.
        fn hex_inner(token: &str) -> Result<&str, String> {
            let t = token.trim();
            if t.len() >= 2 && t.starts_with('<') && t.ends_with('>') {
                Ok(&t[1..t.len() - 1])
            } else {
                Err(format!("Expected token enclosed in <>: {}", t))
            }
        }

        /// Parse a source token (e.g. `<0041>`) into a single CID value.
        fn hex_u32(token: &str) -> Result<u32, String> {
            let inner = hex_inner(token)?;
            u32::from_str_radix(inner, 16)
                .map_err(|e| format!("Failed to parse hex token {}: {}", token.trim(), e))
        }

        /// Parse a destination token into one or more Unicode code points.
        /// Tokens of more than 4 hex digits are treated as UTF-16BE code
        /// units; unpaired surrogates become U+FFFD rather than an error.
        fn hex_unicode(token: &str) -> Result<Vec<u32>, String> {
            let inner = hex_inner(token)?;
            if inner.len() <= 4 {
                let v = u32::from_str_radix(inner, 16)
                    .map_err(|e| format!("Failed to parse hex token {}: {}", token.trim(), e))?;
                return Ok(vec![v]);
            }
            // Split into 16-bit code units (4 hex digits each).
            let digits: Vec<char> = inner.chars().collect();
            let mut units = Vec::with_capacity(digits.len() / 4 + 1);
            for chunk in digits.chunks(4) {
                let s: String = chunk.iter().collect();
                units.push(
                    u16::from_str_radix(&s, 16)
                        .map_err(|e| format!("Failed to parse hex token {}: {}", token.trim(), e))?,
                );
            }
            Ok(std::char::decode_utf16(units)
                .map(|r| r.map_or(std::char::REPLACEMENT_CHARACTER as u32, |c| c as u32))
                .collect())
        }

        let mut mappings = BTreeMap::new();
        let mut lines = input.lines().map(str::trim).filter(|l| !l.is_empty());
        while let Some(line) = lines.next() {
            // `contains` rather than `starts_with`: lines normally read
            // "N beginbfchar" / "N beginbfrange" with a leading count.
            if line.contains("beginbfchar") {
                // Each line until "endbfchar" maps one CID: "<0041> <0041>"
                while let Some(l) = lines.next() {
                    if l.contains("endbfchar") {
                        break;
                    }
                    let tokens: Vec<&str> = l.split_whitespace().collect();
                    if tokens.len() < 2 {
                        continue; // skip malformed lines rather than failing hard
                    }
                    let cid = hex_u32(tokens[0])?;
                    mappings.insert(cid, hex_unicode(tokens[1])?);
                }
            } else if line.contains("beginbfrange") {
                while let Some(l) = lines.next() {
                    if l.contains("endbfrange") {
                        break;
                    }
                    // Two forms:
                    //   form1: <start> <end> <startUnicode>
                    //   form2: <start> <end> [ <unicode1> <unicode2> ... ]
                    let tokens: Vec<&str> = l.split_whitespace().collect();
                    if tokens.len() < 3 {
                        continue;
                    }
                    let start = hex_u32(tokens[0])?;
                    let end = hex_u32(tokens[1])?;
                    if end < start {
                        // Guard: the original `end - start + 1` would underflow.
                        return Err(format!("bfrange with end {} < start {}", end, start));
                    }
                    if tokens[2].starts_with('[') {
                        // form2: collect the bracketed tokens.
                        // NOTE(review): the array is assumed to fit on one
                        // source line — multi-line arrays are not handled.
                        let mut arr_tokens = vec![tokens[2].trim_start_matches('[')];
                        for token in tokens.iter().skip(3) {
                            if token.ends_with(']') {
                                arr_tokens.push(token.trim_end_matches(']'));
                                break;
                            }
                            arr_tokens.push(token);
                        }
                        // "[ <...> ]" with spaces yields empty fragments; drop them.
                        arr_tokens.retain(|t| !t.is_empty());
                        let expected = (end - start + 1) as usize;
                        if arr_tokens.len() != expected {
                            return Err(format!(
                                "bfrange array length mismatch: expected {} but got {}",
                                expected,
                                arr_tokens.len()
                            ));
                        }
                        for (i, token) in arr_tokens.iter().enumerate() {
                            mappings.insert(start + i as u32, hex_unicode(token)?);
                        }
                    } else {
                        // form1: consecutive unicode values starting at tokens[2].
                        let start_uni = hex_u32(tokens[2])?;
                        for (offset, cid) in (start..=end).enumerate() {
                            mappings.insert(cid, vec![start_uni + offset as u32]);
                        }
                    }
                }
            }
        }
        Ok(ToUnicodeCMap { mappings })
    }
}

/// Helper: parse a hex token of the form `<...>` into its numeric value.
///
/// The token is trimmed first; anything not wrapped in `<` and `>` (or
/// shorter than two characters) is rejected with a descriptive error.
fn parse_hex_token(token: &str) -> Result<u32, String> {
    let trimmed = token.trim();
    if trimmed.len() < 2 {
        return Err("Hex token too short".into());
    }
    match trimmed.strip_prefix('<').and_then(|rest| rest.strip_suffix('>')) {
        Some(digits) => u32::from_str_radix(digits, 16)
            .map_err(|e| format!("Failed to parse hex token {}: {}", trimmed, e)),
        None => Err(format!("Expected token enclosed in <>: {}", trimmed)),
    }
}

/// Implement the CMap trait on our ToUnicodeCMap.
impl CMap for ToUnicodeCMap {
    /// Decode a byte string by treating it as a sequence of 2-byte
    /// big-endian CIDs and looking each one up in the mapping table.
    /// CIDs with no mapping are skipped, as is any trailing odd byte.
    fn map_bytes(&self, bytes: &[u8]) -> String {
        let mut out = String::new();
        for pair in bytes.chunks_exact(2) {
            let cid = u32::from(u16::from_be_bytes([pair[0], pair[1]]));
            if let Some(code_points) = self.mappings.get(&cid) {
                // Drop code points that are not valid Unicode scalars.
                out.extend(code_points.iter().filter_map(|&cp| std::char::from_u32(cp)));
            }
        }
        out
    }
}

/// Looks for a `ToUnicode` CMap entry on the font dictionary, resolves it
/// to a stream and parses it into a [`ToUnicodeCMap`].
///
/// # Errors
/// Returns an error string when the entry is absent, is neither a stream
/// nor a reference to one, cannot be decompressed, or is not valid UTF-8.
pub fn get_to_unicode_cmap_from_font(
    font_dict: &Dictionary,
    doc: &Document,
) -> Result<ToUnicodeCMap, String> {
    // Absence of /ToUnicode is an error for callers that need the mapping.
    let entry = font_dict
        .get(b"ToUnicode")
        .ok()
        .ok_or("No ToUnicode entry found")?;

    // The entry is either an indirect reference or an inline stream object.
    let stream = match entry {
        Object::Reference(id) => doc
            .get_object(*id)
            .and_then(|resolved| resolved.as_stream().map(|s| s.clone()))
            .map_err(|e| format!("Error getting ToUnicode stream: {}", e))?,
        Object::Stream(s) => s.clone(),
        _ => return Err("Unexpected type for ToUnicode entry".into()),
    };

    let raw = stream
        .decompressed_content()
        .map_err(|e| format!("Decompress error: {}", e))?;

    let text = String::from_utf8(raw)
        .map_err(|e| format!("UTF-8 conversion error: {}", e))?;

    ToUnicodeCMap::parse(&text)
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use serde_derive::{Deserialize, Serialize};
/// Link / bookmark annotation handling
pub mod annotation;
pub mod text;
pub mod cmap;
pub mod wasm;
pub use annotation::*;
/// PDF standard handling
Expand Down
167 changes: 167 additions & 0 deletions src/text.rs
Original file line number Diff line number Diff line change
@@ -1 +1,168 @@
//! PDF Text decoding / encoding and ToUnicode handling
use lopdf::{Object, StringFormat};

/// Represents a text segment (decoded as a UTF-8 String) or a spacing adjustment,
/// as found in the array operand of a PDF `TJ` show-text operator.
#[derive(Debug, Clone)]
pub enum TextItem {
    Text(String), // A segment of text, already decoded to UTF-8
    Offset(i32), // A spacing adjustment (in thousandths of an em)
}

/// A trait for mapping raw byte sequences to Unicode using a ToUnicode CMap.
/// (In a full implementation, this would use the actual mapping defined in the PDF.)
pub trait CMap {
    /// Maps the raw bytes of a PDF string to a decoded Unicode `String`.
    fn map_bytes(&self, bytes: &[u8]) -> String;
}

/// Decode a PDF string (literal or hexadecimal) into a Rust UTF-8 String.
///
/// When a ToUnicode CMap is supplied, the raw bytes are mapped through it;
/// otherwise the bytes are interpreted as UTF-8 (lossily) as a fallback.
/// Hex strings starting with a 0xFEFF BOM are decoded as UTF-16BE.
/// Non-string objects decode to the empty string.
pub fn decode_pdf_string(obj: &Object, to_unicode: Option<&impl CMap>) -> String {
    match obj {
        Object::String(bytes, StringFormat::Literal) => {
            // Escape sequences (\(, \), octal codes, ...) are assumed to be
            // already resolved by the PDF parser that produced `bytes`.
            match to_unicode {
                Some(cmap) => cmap.map_bytes(bytes),
                None => String::from_utf8_lossy(bytes).into_owned(),
            }
        }
        Object::String(bytes, StringFormat::Hexadecimal) => {
            if bytes.starts_with(&[0xFE, 0xFF]) {
                // BOM present: decode the remainder as UTF-16BE
                // (a trailing odd byte is ignored).
                let units: Vec<u16> = bytes[2..]
                    .chunks_exact(2)
                    .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
                    .collect();
                String::from_utf16(&units).unwrap_or_default()
            } else {
                // Without BOM, use the ToUnicode mapping if available, or fall back.
                match to_unicode {
                    Some(cmap) => cmap.map_bytes(bytes),
                    None => String::from_utf8_lossy(bytes).into_owned(),
                }
            }
        }
        _ => String::new(),
    }
}

/// Given the operands of a TJ operator (an array of PDF objects), decode
/// them into a `Vec<TextItem>`: string elements become [`TextItem::Text`]
/// (after decoding) and numbers become [`TextItem::Offset`]. Operands of
/// any other type are silently ignored.
pub fn decode_tj_operands(operands: &[Object], to_unicode: Option<&impl CMap>) -> Vec<TextItem> {
    operands
        .iter()
        .filter_map(|obj| match obj {
            Object::String(_, _) => Some(TextItem::Text(decode_pdf_string(obj, to_unicode))),
            Object::Integer(i) => Some(TextItem::Offset(*i as i32)),
            Object::Real(r) => Some(TextItem::Offset(*r as i32)),
            _ => None, // unsupported operand types are dropped
        })
        .collect()
}

/// Encode a Rust string as a PDF literal string.
///
/// Surrounds the string with parentheses and escapes the delimiters,
/// the backslash, and common control characters (\n, \r, \t, \b, \f).
/// All other characters pass through unchanged.
pub fn encode_pdf_string_literal(s: &str) -> String {
    let mut out = String::with_capacity(s.len() + 2);
    out.push('(');
    for ch in s.chars() {
        // Characters with a dedicated escape form; everything else is literal.
        let escape = match ch {
            '(' => Some("\\("),
            ')' => Some("\\)"),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            '\x08' => Some("\\b"),
            '\x0C' => Some("\\f"),
            _ => None,
        };
        match escape {
            Some(seq) => out.push_str(seq),
            None => out.push(ch),
        }
    }
    out.push(')');
    out
}

/// Encode a Rust string as a PDF hex string.
///
/// The string is encoded as UTF-16BE prefixed with a BOM (0xFEFF), and each
/// byte is emitted as two uppercase hex digits between `<` and `>`.
pub fn encode_pdf_string_hex(s: &str) -> String {
    // BOM first, then the UTF-16 code units of the string itself.
    let units = std::iter::once(0xFEFFu16).chain(s.encode_utf16());
    let mut hex = String::with_capacity(4 + s.len() * 4);
    for unit in units {
        for byte in unit.to_be_bytes() {
            hex.push_str(&format!("{:02X}", byte));
        }
    }
    format!("<{}>", hex)
}

/// Given a Rust string, decide whether a literal or hex encoding yields a
/// smaller output and return that PDF string representation.
/// On a tie, the literal form wins.
pub fn encode_pdf_string_minimal(s: &str) -> String {
    let as_literal = encode_pdf_string_literal(s);
    let as_hex = encode_pdf_string_hex(s);
    if as_hex.len() < as_literal.len() {
        as_hex
    } else {
        as_literal
    }
}

/// Encodes a vector of TextItem into a vector of lopdf::Object suitable for
/// a TJ operator.
///
/// Text segments become PDF strings — hexadecimal (UTF-16BE with BOM) or
/// literal (raw UTF-8 bytes), whichever textual encoding is smaller — and
/// spacing offsets become integers.
pub fn encode_text_items(items: &[TextItem]) -> Vec<Object> {
    let mut objs = Vec::with_capacity(items.len());
    for item in items {
        match item {
            TextItem::Text(s) => {
                // Use the minimal textual encoding only to *choose* the
                // format; build the raw bytes directly instead of re-parsing
                // the hex string (the old round-trip silently dropped any
                // pair that failed to parse).
                let minimal = encode_pdf_string_minimal(s);
                if minimal.starts_with('<') {
                    // Hexadecimal: BOM + UTF-16BE code units, exactly the
                    // bytes encode_pdf_string_hex represents in hex.
                    let mut bytes = Vec::with_capacity(2 + s.len() * 2);
                    bytes.extend_from_slice(&[0xFE, 0xFF]);
                    for unit in s.encode_utf16() {
                        bytes.extend_from_slice(&unit.to_be_bytes());
                    }
                    objs.push(Object::String(bytes, StringFormat::Hexadecimal));
                } else {
                    // Literal: raw UTF-8 bytes; escaping is applied by the
                    // writer when the object is serialized.
                    objs.push(Object::String(s.as_bytes().to_vec(), StringFormat::Literal));
                }
            }
            TextItem::Offset(n) => {
                objs.push(Object::Integer(*n as i64));
            }
        }
    }
    objs
}

0 comments on commit 16ba6a6

Please sign in to comment.