Skip to content

Commit

Permalink
Merge pull request #233 from jamebal/develop
Browse files Browse the repository at this point in the history
fix: 修复部分pdf文档读取失败的问题
  • Loading branch information
jamebal authored Feb 27, 2025
2 parents d804500 + 55d883a commit d9c4da7
Showing 1 changed file with 35 additions and 20 deletions.
55 changes: 35 additions & 20 deletions src/main/java/com/jmal/clouddisk/lucene/ReadContentService.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package com.jmal.clouddisk.lucene;

import cn.hutool.core.util.StrUtil;
import cn.hutool.core.text.CharSequenceUtil;
import com.jmal.clouddisk.media.VideoProcessService;
import com.jmal.clouddisk.ocr.OcrService;
import com.jmal.clouddisk.service.Constants;
Expand Down Expand Up @@ -71,7 +71,7 @@ public class ReadContentService {
public String dwg2mxweb(File file, String fileId) {
String username = commonFileService.getUsernameByAbsolutePath(Path.of(file.getAbsolutePath()));
// 生成封面图像
if (StrUtil.isNotBlank(fileId)) {
if (CharSequenceUtil.isNotBlank(fileId)) {
String outputName = file.getName() + Constants.MXWEB_SUFFIX;
FileContentUtil.dwgConvert(file.getAbsolutePath(), videoProcessService.getVideoCacheDir(username, fileId), outputName);
}
Expand All @@ -80,8 +80,14 @@ public String dwg2mxweb(File file, String fileId) {

public static boolean checkPageContent(PDDocument document, int pageIndex) throws IOException {
PDPage page = document.getPage(pageIndex); // 获取页面
// 检查图片内容
if (page == null) {
return false;
}
// 检查是否含有图片
PDResources resources = page.getResources();
if (resources == null) {
return false;
}
for (COSName xObjectName : resources.getXObjectNames()) {
PDXObject xObject = resources.getXObject(xObjectName);
if (xObject instanceof PDImageXObject) {
Expand All @@ -97,7 +103,7 @@ public String readPdfContent(File file, String fileId) {
String username = commonFileService.getUsernameByAbsolutePath(Path.of(file.getAbsolutePath()));

// 生成封面图像
if (StrUtil.isNotBlank(fileId)) {
if (CharSequenceUtil.isNotBlank(fileId)) {
File coverFile = FileContentUtil.pdfCoverImage(file, document, videoProcessService.getVideoCacheDir(username, fileId));
commonFileService.updateCoverFileDocument(fileId, coverFile);
}
Expand All @@ -107,21 +113,7 @@ public String readPdfContent(File file, String fileId) {
PDFTextStripper pdfStripper = new PDFTextStripper();

for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) { // 使用 0-based 索引
int pageNumber = pageIndex + 1;
pdfStripper.setStartPage(pageNumber); // PDFTextStripper 使用 1-based 索引
pdfStripper.setEndPage(pageNumber);
String text = pdfStripper.getText(document).trim();

// 如果页面包含文字,添加提取的文字
if (!text.isEmpty()) {
content.append(text);
}
// 如果页面包含图片或没有文字,则进行 OCR
if (checkPageContent(document, pageIndex) || text.isEmpty()) {
if (ocrService.getOcrConfig().getEnable()) {
content.append(ocrService.extractPageWithOCR(file, pdfRenderer, pageIndex, document.getNumberOfPages(), username));
}
}
readPdfOfPage(file, pageIndex, pdfStripper, document, content, pdfRenderer, username);
}
return content.toString();
} catch (IOException e) {
Expand All @@ -130,6 +122,28 @@ public String readPdfContent(File file, String fileId) {
return null;
}

/**
 * Extracts the content of a single PDF page and appends it to {@code content}.
 *
 * <p>The page's embedded text (trimmed) is appended first when it is non-empty. OCR output is
 * appended afterwards when the page contains an image or yielded no text, but only if the OCR
 * configuration's enable flag is {@code Boolean.TRUE}. Any exception raised while handling the
 * page is logged and not rethrown.
 *
 * @param file        the PDF file; its name appears in log messages and it is handed to the OCR service
 * @param pageIndex   0-based index of the page to read
 * @param pdfStripper text stripper whose start/end page are set to this page
 * @param document    the opened PDF document
 * @param content     accumulator that receives the extracted text and OCR output
 * @param pdfRenderer renderer passed through to the OCR service
 * @param username    user name passed through to the OCR service
 */
private void readPdfOfPage(File file, int pageIndex, PDFTextStripper pdfStripper, PDDocument document, StringBuilder content, PDFRenderer pdfRenderer, String username) {
    try {
        // PDFTextStripper addresses pages with 1-based numbers.
        int oneBasedPage = pageIndex + 1;
        pdfStripper.setStartPage(oneBasedPage);
        pdfStripper.setEndPage(oneBasedPage);
        String pageText = pdfStripper.getText(document).trim();

        // Keep whatever text the page carries.
        boolean hasText = !pageText.isEmpty();
        if (hasText) {
            content.append(pageText);
        }

        // OCR is only relevant for pages that contain an image or produced no text.
        boolean hasImage = checkPageContent(document, pageIndex);
        if (hasText && !hasImage) {
            return;
        }
        // Boolean.TRUE.equals treats a null enable flag as "disabled".
        if (!Boolean.TRUE.equals(ocrService.getOcrConfig().getEnable())) {
            return;
        }
        content.append(ocrService.extractPageWithOCR(file, pdfRenderer, pageIndex, document.getNumberOfPages(), username));
    } catch (IOException e) {
        log.error("提取文字失败, {}, 页数: {}", file.getName(), pageIndex, e);
    } catch (Exception e) {
        log.error("提取文字失败1, {}, 页数: {}", file.getName(), pageIndex, e);
    }
}

public String readEpubContent(File file, String fileId) {
try (InputStream fileInputStream = new FileInputStream(file)) {
// 打开 EPUB 文件
Expand All @@ -138,7 +152,7 @@ public String readEpubContent(File file, String fileId) {

// 生成封面图像
String username = commonFileService.getUsernameByAbsolutePath(Path.of(file.getAbsolutePath()));
if (StrUtil.isNotBlank(fileId)) {
if (CharSequenceUtil.isNotBlank(fileId)) {
File coverFile = FileContentUtil.epubCoverImage(book, videoProcessService.getVideoCacheDir(username, fileId));
commonFileService.updateCoverFileDocument(fileId, coverFile);
}
Expand Down Expand Up @@ -206,6 +220,7 @@ private void readSlides(Iterable<?> slides, StringBuilder content) {
}
}
}

public String readWordContent(File file) {
try (FileInputStream fis = new FileInputStream(file)) {
try {
Expand Down

0 comments on commit d9c4da7

Please sign in to comment.