Skip to content

Commit

Permalink
Impl CMap parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
fschutt committed Feb 27, 2025
1 parent 597b5c1 commit 16ba6a6
Show file tree
Hide file tree
Showing 3 changed files with 323 additions and 0 deletions.
155 changes: 155 additions & 0 deletions src/cmap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
//! ToUnicode CMap parsing
use std::collections::BTreeMap;
use lopdf::{Dictionary, Document, Object};
use crate::text::CMap;

/// The mapping from a CID to one or more Unicode code points.
#[derive(Debug)]
pub struct ToUnicodeCMap {
    /// CID -> Unicode scalar values. A single CID may expand to several
    /// code points (e.g. ligatures such as "ff" mapping to "f","f").
    pub mappings: BTreeMap<u32, Vec<u32>>,
}

impl ToUnicodeCMap {
    /// Parses a ToUnicode CMap from the given input string.
    ///
    /// Handles `bfchar` and `bfrange` sections. Real-world CMaps prefix the
    /// section keywords with a count (e.g. `"2 beginbfchar"`), so keywords
    /// are matched anywhere in the line rather than only at line start.
    /// Destination tokens longer than four hex digits are decoded as a
    /// sequence of UTF-16BE code units, so one CID can map to several
    /// code points (and surrogate pairs are combined correctly).
    ///
    /// Other sections (e.g. `codespacerange`) are intentionally skipped.
    ///
    /// # Errors
    /// Returns a descriptive `String` on malformed hex tokens, on a
    /// `bfrange` whose end is below its start, or on a form-2 `bfrange`
    /// array whose length does not match the CID range.
    pub fn parse(input: &str) -> Result<ToUnicodeCMap, String> {
        /// Strip the `<...>` delimiters from a hex token.
        fn hex_inner(token: &str) -> Result<&str, String> {
            let t = token.trim();
            if t.len() >= 2 && t.starts_with('<') && t.ends_with('>') {
                Ok(&t[1..t.len() - 1])
            } else {
                Err(format!("Expected token enclosed in <>: {}", t))
            }
        }

        /// Parse a source token (e.g. `<0041>`) into a single CID value.
        fn hex_u32(token: &str) -> Result<u32, String> {
            let inner = hex_inner(token)?;
            u32::from_str_radix(inner, 16)
                .map_err(|e| format!("Failed to parse hex token {}: {}", token.trim(), e))
        }

        /// Parse a destination token into one or more Unicode code points.
        /// Tokens of more than 4 hex digits are treated as UTF-16BE code
        /// units; unpaired surrogates become U+FFFD rather than an error.
        fn hex_unicode(token: &str) -> Result<Vec<u32>, String> {
            let inner = hex_inner(token)?;
            if inner.len() <= 4 {
                let v = u32::from_str_radix(inner, 16)
                    .map_err(|e| format!("Failed to parse hex token {}: {}", token.trim(), e))?;
                return Ok(vec![v]);
            }
            // Split into 16-bit code units (4 hex digits each).
            let digits: Vec<char> = inner.chars().collect();
            let mut units = Vec::with_capacity(digits.len() / 4 + 1);
            for chunk in digits.chunks(4) {
                let s: String = chunk.iter().collect();
                units.push(
                    u16::from_str_radix(&s, 16)
                        .map_err(|e| format!("Failed to parse hex token {}: {}", token.trim(), e))?,
                );
            }
            Ok(std::char::decode_utf16(units)
                .map(|r| r.map_or(std::char::REPLACEMENT_CHARACTER as u32, |c| c as u32))
                .collect())
        }

        let mut mappings = BTreeMap::new();
        let mut lines = input.lines().map(str::trim).filter(|l| !l.is_empty());
        while let Some(line) = lines.next() {
            // `contains` rather than `starts_with`: lines normally read
            // "N beginbfchar" / "N beginbfrange" with a leading count.
            if line.contains("beginbfchar") {
                // Each line until "endbfchar" maps one CID: "<0041> <0041>"
                while let Some(l) = lines.next() {
                    if l.contains("endbfchar") {
                        break;
                    }
                    let tokens: Vec<&str> = l.split_whitespace().collect();
                    if tokens.len() < 2 {
                        continue; // skip malformed lines rather than failing hard
                    }
                    let cid = hex_u32(tokens[0])?;
                    mappings.insert(cid, hex_unicode(tokens[1])?);
                }
            } else if line.contains("beginbfrange") {
                while let Some(l) = lines.next() {
                    if l.contains("endbfrange") {
                        break;
                    }
                    // Two forms:
                    //   form1: <start> <end> <startUnicode>
                    //   form2: <start> <end> [ <unicode1> <unicode2> ... ]
                    let tokens: Vec<&str> = l.split_whitespace().collect();
                    if tokens.len() < 3 {
                        continue;
                    }
                    let start = hex_u32(tokens[0])?;
                    let end = hex_u32(tokens[1])?;
                    if end < start {
                        // Guard: the original `end - start + 1` would underflow.
                        return Err(format!("bfrange with end {} < start {}", end, start));
                    }
                    if tokens[2].starts_with('[') {
                        // form2: collect the bracketed tokens.
                        // NOTE(review): the array is assumed to fit on one
                        // source line — multi-line arrays are not handled.
                        let mut arr_tokens = vec![tokens[2].trim_start_matches('[')];
                        for token in tokens.iter().skip(3) {
                            if token.ends_with(']') {
                                arr_tokens.push(token.trim_end_matches(']'));
                                break;
                            }
                            arr_tokens.push(token);
                        }
                        // "[ <...> ]" with spaces yields empty fragments; drop them.
                        arr_tokens.retain(|t| !t.is_empty());
                        let expected = (end - start + 1) as usize;
                        if arr_tokens.len() != expected {
                            return Err(format!(
                                "bfrange array length mismatch: expected {} but got {}",
                                expected,
                                arr_tokens.len()
                            ));
                        }
                        for (i, token) in arr_tokens.iter().enumerate() {
                            mappings.insert(start + i as u32, hex_unicode(token)?);
                        }
                    } else {
                        // form1: consecutive unicode values starting at tokens[2].
                        let start_uni = hex_u32(tokens[2])?;
                        for (offset, cid) in (start..=end).enumerate() {
                            mappings.insert(cid, vec![start_uni + offset as u32]);
                        }
                    }
                }
            }
        }
        Ok(ToUnicodeCMap { mappings })
    }
}

/// Helper: parse a hex token of the form `<...>` into its numeric value.
///
/// The token is trimmed first; anything not wrapped in `<` and `>` (or
/// shorter than two characters) is rejected with a descriptive error.
fn parse_hex_token(token: &str) -> Result<u32, String> {
    let trimmed = token.trim();
    if trimmed.len() < 2 {
        return Err("Hex token too short".into());
    }
    match trimmed.strip_prefix('<').and_then(|rest| rest.strip_suffix('>')) {
        Some(digits) => u32::from_str_radix(digits, 16)
            .map_err(|e| format!("Failed to parse hex token {}: {}", trimmed, e)),
        None => Err(format!("Expected token enclosed in <>: {}", trimmed)),
    }
}

/// Implement the CMap trait on our ToUnicodeCMap.
impl CMap for ToUnicodeCMap {
    /// Decode a byte string by treating it as a sequence of 2-byte
    /// big-endian CIDs and looking each one up in the mapping table.
    /// CIDs with no mapping are skipped, as is any trailing odd byte.
    fn map_bytes(&self, bytes: &[u8]) -> String {
        let mut out = String::new();
        for pair in bytes.chunks_exact(2) {
            let cid = u32::from(u16::from_be_bytes([pair[0], pair[1]]));
            if let Some(code_points) = self.mappings.get(&cid) {
                // Drop code points that are not valid Unicode scalars.
                out.extend(code_points.iter().filter_map(|&cp| std::char::from_u32(cp)));
            }
        }
        out
    }
}

/// Looks for a `ToUnicode` CMap entry on the font dictionary, resolves it
/// to a stream and parses it into a [`ToUnicodeCMap`].
///
/// # Errors
/// Returns an error string when the entry is absent, is neither a stream
/// nor a reference to one, cannot be decompressed, or is not valid UTF-8.
pub fn get_to_unicode_cmap_from_font(
    font_dict: &Dictionary,
    doc: &Document,
) -> Result<ToUnicodeCMap, String> {
    // Absence of /ToUnicode is an error for callers that need the mapping.
    let entry = font_dict
        .get(b"ToUnicode")
        .ok()
        .ok_or("No ToUnicode entry found")?;

    // The entry is either an indirect reference or an inline stream object.
    let stream = match entry {
        Object::Reference(id) => doc
            .get_object(*id)
            .and_then(|resolved| resolved.as_stream().map(|s| s.clone()))
            .map_err(|e| format!("Error getting ToUnicode stream: {}", e))?,
        Object::Stream(s) => s.clone(),
        _ => return Err("Unexpected type for ToUnicode entry".into()),
    };

    let raw = stream
        .decompressed_content()
        .map_err(|e| format!("Decompress error: {}", e))?;

    let text = String::from_utf8(raw)
        .map_err(|e| format!("UTF-8 conversion error: {}", e))?;

    ToUnicodeCMap::parse(&text)
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use serde_derive::{Deserialize, Serialize};
/// Link / bookmark annotation handling
pub mod annotation;
pub mod text;
pub mod cmap;
pub mod wasm;
pub use annotation::*;
/// PDF standard handling
Expand Down
167 changes: 167 additions & 0 deletions src/text.rs
Original file line number Diff line number Diff line change
@@ -1 +1,168 @@
//! PDF Text decoding / encoding and ToUnicode handling
use lopdf::{Object, StringFormat};

/// Represents a text segment (decoded as a UTF-8 String) or a spacing adjustment,
/// as found in the array operand of a PDF `TJ` show-text operator.
#[derive(Debug, Clone)]
pub enum TextItem {
    Text(String), // A segment of text, already decoded to UTF-8
    Offset(i32), // A spacing adjustment (in thousandths of an em)
}

/// A trait for mapping raw byte sequences to Unicode using a ToUnicode CMap.
/// (In a full implementation, this would use the actual mapping defined in the PDF.)
pub trait CMap {
    /// Maps the raw bytes of a PDF string to a decoded Unicode `String`.
    fn map_bytes(&self, bytes: &[u8]) -> String;
}

/// Decode a PDF string (literal or hexadecimal) into a Rust UTF-8 String.
///
/// When a ToUnicode CMap is supplied, the raw bytes are mapped through it;
/// otherwise the bytes are interpreted as UTF-8 (lossily) as a fallback.
/// Hex strings starting with a 0xFEFF BOM are decoded as UTF-16BE.
/// Non-string objects decode to the empty string.
pub fn decode_pdf_string(obj: &Object, to_unicode: Option<&impl CMap>) -> String {
    match obj {
        Object::String(bytes, StringFormat::Literal) => {
            // Escape sequences (\(, \), octal codes, ...) are assumed to be
            // already resolved by the PDF parser that produced `bytes`.
            match to_unicode {
                Some(cmap) => cmap.map_bytes(bytes),
                None => String::from_utf8_lossy(bytes).into_owned(),
            }
        }
        Object::String(bytes, StringFormat::Hexadecimal) => {
            if bytes.starts_with(&[0xFE, 0xFF]) {
                // BOM present: decode the remainder as UTF-16BE
                // (a trailing odd byte is ignored).
                let units: Vec<u16> = bytes[2..]
                    .chunks_exact(2)
                    .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
                    .collect();
                String::from_utf16(&units).unwrap_or_default()
            } else {
                // Without BOM, use the ToUnicode mapping if available, or fall back.
                match to_unicode {
                    Some(cmap) => cmap.map_bytes(bytes),
                    None => String::from_utf8_lossy(bytes).into_owned(),
                }
            }
        }
        _ => String::new(),
    }
}

/// Given the operands of a TJ operator (an array of PDF objects), decode
/// them into a `Vec<TextItem>`: string elements become [`TextItem::Text`]
/// (after decoding) and numbers become [`TextItem::Offset`]. Operands of
/// any other type are silently ignored.
pub fn decode_tj_operands(operands: &[Object], to_unicode: Option<&impl CMap>) -> Vec<TextItem> {
    operands
        .iter()
        .filter_map(|obj| match obj {
            Object::String(_, _) => Some(TextItem::Text(decode_pdf_string(obj, to_unicode))),
            Object::Integer(i) => Some(TextItem::Offset(*i as i32)),
            Object::Real(r) => Some(TextItem::Offset(*r as i32)),
            _ => None, // unsupported operand types are dropped
        })
        .collect()
}

/// Encode a Rust string as a PDF literal string.
///
/// Surrounds the string with parentheses and escapes the delimiters,
/// the backslash, and common control characters (\n, \r, \t, \b, \f).
/// All other characters pass through unchanged.
pub fn encode_pdf_string_literal(s: &str) -> String {
    let mut out = String::with_capacity(s.len() + 2);
    out.push('(');
    for ch in s.chars() {
        // Characters with a dedicated escape form; everything else is literal.
        let escape = match ch {
            '(' => Some("\\("),
            ')' => Some("\\)"),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            '\x08' => Some("\\b"),
            '\x0C' => Some("\\f"),
            _ => None,
        };
        match escape {
            Some(seq) => out.push_str(seq),
            None => out.push(ch),
        }
    }
    out.push(')');
    out
}

/// Encode a Rust string as a PDF hex string.
///
/// The string is encoded as UTF-16BE prefixed with a BOM (0xFEFF), and each
/// byte is emitted as two uppercase hex digits between `<` and `>`.
pub fn encode_pdf_string_hex(s: &str) -> String {
    // BOM first, then the UTF-16 code units of the string itself.
    let units = std::iter::once(0xFEFFu16).chain(s.encode_utf16());
    let mut hex = String::with_capacity(4 + s.len() * 4);
    for unit in units {
        for byte in unit.to_be_bytes() {
            hex.push_str(&format!("{:02X}", byte));
        }
    }
    format!("<{}>", hex)
}

/// Given a Rust string, decide whether a literal or hex encoding yields a
/// smaller output and return that PDF string representation.
/// On a tie, the literal form wins.
pub fn encode_pdf_string_minimal(s: &str) -> String {
    let as_literal = encode_pdf_string_literal(s);
    let as_hex = encode_pdf_string_hex(s);
    if as_hex.len() < as_literal.len() {
        as_hex
    } else {
        as_literal
    }
}

/// Encodes a vector of TextItem into a vector of lopdf::Object suitable for
/// a TJ operator.
///
/// Text segments become PDF strings — hexadecimal (UTF-16BE with BOM) or
/// literal (raw UTF-8 bytes), whichever textual encoding is smaller — and
/// spacing offsets become integers.
pub fn encode_text_items(items: &[TextItem]) -> Vec<Object> {
    let mut objs = Vec::with_capacity(items.len());
    for item in items {
        match item {
            TextItem::Text(s) => {
                // Use the minimal textual encoding only to *choose* the
                // format; build the raw bytes directly instead of re-parsing
                // the hex string (the old round-trip silently dropped any
                // pair that failed to parse).
                let minimal = encode_pdf_string_minimal(s);
                if minimal.starts_with('<') {
                    // Hexadecimal: BOM + UTF-16BE code units, exactly the
                    // bytes encode_pdf_string_hex represents in hex.
                    let mut bytes = Vec::with_capacity(2 + s.len() * 2);
                    bytes.extend_from_slice(&[0xFE, 0xFF]);
                    for unit in s.encode_utf16() {
                        bytes.extend_from_slice(&unit.to_be_bytes());
                    }
                    objs.push(Object::String(bytes, StringFormat::Hexadecimal));
                } else {
                    // Literal: raw UTF-8 bytes; escaping is applied by the
                    // writer when the object is serialized.
                    objs.push(Object::String(s.as_bytes().to_vec(), StringFormat::Literal));
                }
            }
            TextItem::Offset(n) => {
                objs.push(Object::Integer(*n as i64));
            }
        }
    }
    objs
}

0 comments on commit 16ba6a6

Please sign in to comment.