|
@@ -6,7 +6,6 @@ import com.lowagie.text.pdf.PdfContentByte;
|
|
|
import com.lowagie.text.pdf.PdfReader;
|
|
|
import com.lowagie.text.pdf.PdfStamper;
|
|
|
import com.lowagie.text.pdf.parser.PdfTextExtractor;
|
|
|
-import fr.opensagres.poi.xwpf.converter.core.FileImageExtractor;
|
|
|
import fr.opensagres.poi.xwpf.converter.core.FileURIResolver;
|
|
|
import fr.opensagres.poi.xwpf.converter.core.ImageManager;
|
|
|
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
|
|
@@ -19,7 +18,6 @@ import org.jsoup.nodes.Entities;
|
|
|
import org.jsoup.select.Elements;
|
|
|
import org.xhtmlrenderer.pdf.ITextFontResolver;
|
|
|
import org.xhtmlrenderer.pdf.ITextRenderer;
|
|
|
-
|
|
|
import java.awt.Color;
|
|
|
import java.io.ByteArrayOutputStream;
|
|
|
import java.io.File;
|
|
@@ -30,7 +28,6 @@ import java.io.InputStream;
|
|
|
import java.io.OutputStream;
|
|
|
import java.nio.file.Path;
|
|
|
import java.nio.file.Paths;
|
|
|
-import java.util.ArrayList;
|
|
|
import java.util.HashSet;
|
|
|
import java.util.LinkedHashMap;
|
|
|
import java.util.List;
|
|
@@ -42,7 +39,6 @@ import java.util.regex.Pattern;
|
|
|
public class OfficeUtil1 {
|
|
|
private static final org.slf4j.Logger OFFICE_UTIL_LOGGER = org.slf4j.LoggerFactory.getLogger(OfficeUtil1.class);
|
|
|
private static Map<String, Integer> pageNumberMap = new LinkedHashMap<>();
|
|
|
-
|
|
|
public static String convert(String docxPath, String imageDir) throws IOException {
|
|
|
File imageDirFile = new File(imageDir);
|
|
|
if (!imageDirFile.exists() && !imageDirFile.mkdirs()) {
|
|
@@ -53,40 +49,32 @@ public class OfficeUtil1 {
|
|
|
XWPFDocument document = new XWPFDocument(docxIn);
|
|
|
ByteArrayOutputStream htmlOut = new ByteArrayOutputStream()) {
|
|
|
|
|
|
- // 执行转换
|
|
|
- XHTMLOptions options = createHtmlOptions(imageDirFile);
|
|
|
- XHTMLConverter.getInstance().convert(document, htmlOut, options);
|
|
|
-
|
|
|
- return htmlOut.toString("UTF-8");
|
|
|
- } catch (Exception e) {
|
|
|
- OFFICE_UTIL_LOGGER.error("转换失败: {}", e.getMessage(), e);
|
|
|
- throw new IOException("DOCX转换失败", e);
|
|
|
- }
|
|
|
- }
|
|
|
+ // 自定义 ImageManager:禁止生成 word/media 目录,强制使用 imageDir 根目录
|
|
|
+ ImageManager imageManager = new ImageManager(imageDirFile, "") {
|
|
|
+ @Override
|
|
|
+ public String resolve(String uri) {
|
|
|
|
|
|
+ return new File(imageDir, uri).getAbsolutePath().replace("/", "\\").toString();
|
|
|
+ }
|
|
|
|
|
|
- /**
|
|
|
- * 创建HTML转换选项
|
|
|
- */
|
|
|
- private static XHTMLOptions createHtmlOptions(File imageDirFile) {
|
|
|
- @SuppressWarnings("deprecation")
|
|
|
- XHTMLOptions options = XHTMLOptions.create()
|
|
|
- .setImageManager(new ImageManager(imageDirFile, "") {
|
|
|
- @Override
|
|
|
- public String resolve(String uri) {
|
|
|
- return new File(imageDirFile, uri).getAbsolutePath().replace("/", "\\");
|
|
|
- }
|
|
|
- })
|
|
|
- .URIResolver(new FileURIResolver(imageDirFile) {
|
|
|
- @Override
|
|
|
- public String resolve(String uri) {
|
|
|
- return new File(imageDirFile, uri).getAbsolutePath().replace("/", "\\");
|
|
|
- }
|
|
|
- });
|
|
|
+ };
|
|
|
+
|
|
|
+ XHTMLOptions options = XHTMLOptions.create()
|
|
|
+ .setImageManager(imageManager)
|
|
|
+ .URIResolver(new FileURIResolver(imageDirFile) {
|
|
|
+ @Override
|
|
|
+ public String resolve(String uri) {
|
|
|
+ // 去除 word/media/ 前缀
|
|
|
+ String filename = uri.replace("word/media/", "");
|
|
|
+ return new File(imageDirFile, filename)
|
|
|
+ .getAbsolutePath()
|
|
|
+ .replace("/", "\\");
|
|
|
+ }
|
|
|
+ });
|
|
|
|
|
|
- options.setIgnoreStylesIfUnused(false);
|
|
|
- options.setExtractor(new FileImageExtractor(imageDirFile));
|
|
|
- return options;
|
|
|
+ XHTMLConverter.getInstance().convert(document, htmlOut, options);
|
|
|
+ return htmlOut.toString("UTF-8");
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
public static String formatHtml(String html) {
|
|
@@ -201,7 +189,7 @@ public class OfficeUtil1 {
|
|
|
Element firstImg = images.first();
|
|
|
// 4. 删除第一个img元素
|
|
|
firstImg.parent().remove();
|
|
|
- // 方法二:将所有 white-space:pre-wrap 改为 normal去除转换时的奇怪空白
|
|
|
+ // 将所有 white-space:pre-wrap 改为 normal去除转换时的奇怪空白
|
|
|
Elements allElements = doc.getAllElements();
|
|
|
|
|
|
for (Element element : allElements) {
|
|
@@ -392,7 +380,7 @@ public class OfficeUtil1 {
|
|
|
String newFileName = baseName + "1" + extension;
|
|
|
// 构建完整新路径
|
|
|
String newFilePath = path.resolveSibling(newFileName).toString();
|
|
|
- pdfReader(outputPdfPath, newFilePath, imagePath + "\\image1.jpeg", flag);
|
|
|
+ pdfReader(outputPdfPath, newFilePath, imagePath + File.separator + "image1.jpeg", flag);
|
|
|
|
|
|
return newFilePath;
|
|
|
}
|
|
@@ -429,8 +417,9 @@ public class OfficeUtil1 {
|
|
|
|
|
|
// 收集标题和页码
|
|
|
Pattern titlePattern = Pattern.compile(
|
|
|
- "^((\\d+)\\.\\s+|(\\d+\\.\\d+)\\s+)([\\u4e00-\\u9fa5a-zA-Z].*)$",
|
|
|
- Pattern.MULTILINE);
|
|
|
+ "^((\\d+)\\.\\s+|(\\d+\\.\\d+)\\s+)([\\u4e00-\\u9fa5a-zA-Z0-9].*)$",
|
|
|
+ Pattern.MULTILINE
|
|
|
+ );
|
|
|
Pattern specialPattern = Pattern.compile("^重要声明\\s*[::]?\\s*(.*)$");
|
|
|
|
|
|
for (int pageNum = startPage; pageNum <= reader.getNumberOfPages(); pageNum++) {
|
|
@@ -445,6 +434,10 @@ public class OfficeUtil1 {
|
|
|
String nextLine = (i + 1 < lines.length) ? lines[i + 1].trim() : "";
|
|
|
line = line + (nextLine.isEmpty() ? "" : nextLine);
|
|
|
}
|
|
|
+ if (line.startsWith("7.8 机构管理层(法定代表人,理事长,秘书长)")) {
|
|
|
+ String nextLine = (i + 1 < lines.length) ? lines[i + 1].trim() : "";
|
|
|
+ line = line + (nextLine.isEmpty() ? "" : nextLine);
|
|
|
+ }
|
|
|
|
|
|
Matcher matcher = titlePattern.matcher(line);
|
|
|
if (matcher.matches()) {
|
|
@@ -513,15 +506,6 @@ public class OfficeUtil1 {
|
|
|
image.scaleAbsolute(PageSize.A4.getWidth(), PageSize.A4.getHeight());
|
|
|
image.setAbsolutePosition(0, 0);
|
|
|
background.addImage(image);
|
|
|
- int lastPageIndex = reader.getNumberOfPages();
|
|
|
- //删除最后一页空白页
|
|
|
- String lastPageText = new PdfTextExtractor(reader).getTextFromPage(lastPageIndex);
|
|
|
- // 使用 selectPages 方法删除最后一页
|
|
|
- ArrayList<Integer> pagesToKeep = new ArrayList<>();
|
|
|
- for (int i = 1; i < lastPageIndex; i++) {
|
|
|
- pagesToKeep.add(i);
|
|
|
- }
|
|
|
- reader.selectPages(pagesToKeep);
|
|
|
|
|
|
stamper.close();
|
|
|
reader.close();
|