当前位置: 代码迷 >> 综合 >> Itext 读取pdf内容
  详细解决方案

Itext 读取pdf内容

热度:61   发布时间:2023-11-23 08:50:34.0

引入jar

<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.5.13</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.itextpdf/itext-asian -->
<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itext-asian</artifactId>
    <version>5.2.0</version>
</dependency>

工具类:

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;/*** @author * */
public class PdfUtils {/*** 按行提取文本* @param file* @return List<String>*/public static List<String> extractTXTbyLine(String file) {List<String> listArr = new ArrayList<String>();try {PdfReader reader = new PdfReader(file);int pageNum = reader.getNumberOfPages(); // 获得页数for (int i = 1; i <= pageNum; i++) { // 只能从第1页开始读String textFromPageContent = PdfTextExtractor.getTextFromPage(reader, i);String[] splitArray = textFromPageContent.split("\n");if (splitArray.length > 0) {listArr.addAll(Arrays.asList(splitArray));}}} catch (IOException ex) {Logger.getLogger(PdfUtils.class.getName()).log(Level.SEVERE, null, ex);}return listArr;}public static void main(String args[]) {String file = "F:\\文档\\test.pdf";long startTime = System.currentTimeMillis();List<String> strings = extractTXTbyLine(file);for (String s : strings) {System.out.println(s);}long endTime = System.currentTimeMillis();System.out.println("读写所用时间为:" + (endTime - startTime) + "ms");}
}