문서 파싱 및 추출(pdf, doc, docx, xls, xlsx, ppt, pptx)
필요 라이브러리
Apache PDFBox : http://pdfbox.apache.org/downloads.html
Apache POI : http://poi.apache.org/download.html
pdf 파서
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
/**
 * Extracts the plain text of a PDF file using Apache PDFBox.
 *
 * @author Mubin Shrestha
 */
public class PdfFileParser {
    /**
     * Reads the PDF at the given path and returns its text content.
     *
     * <p>Despite sharing the class name, this is an ordinary instance method, not a
     * constructor; the name is kept so that existing callers keep compiling.
     *
     * @param pdffilePath path of the PDF file to read
     * @return the text produced by {@code PDFTextStripper} for the parsed document
     * @throws FileNotFoundException if the file cannot be opened for reading
     * @throws IOException if parsing, text extraction or closing a resource fails
     */
    public String PdfFileParser(String pdffilePath) throws FileNotFoundException, IOException
    {
        FileInputStream fi = new FileInputStream(new File(pdffilePath));
        try {
            PDFParser parser = new PDFParser(fi);
            parser.parse();
            COSDocument cd = parser.getDocument();
            try {
                PDFTextStripper stripper = new PDFTextStripper();
                return stripper.getText(new PDDocument(cd));
            } finally {
                // Release the parsed document even when text extraction throws.
                cd.close();
            }
        } finally {
            // Release the file handle on every path, including parse failures.
            fi.close();
        }
    }

    /**
     * Prints the text of the PDF named by the hard-coded placeholder path.
     *
     * @param args command-line arguments (unused)
     * @throws FileNotFoundException if the placeholder path does not name a readable file
     * @throws IOException if the PDF cannot be parsed
     */
    public static void main(String args[]) throws FileNotFoundException, IOException
    {
        String filepath = "fullPath";
        System.out.println(new PdfFileParser().PdfFileParser(filepath));
    }
}
doc, xls, ppt 파서
import java.io.FileInputStream;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
 * Extracts text from legacy (OLE2) Microsoft Office files: {@code .doc},
 * {@code .xls} and {@code .ppt}. The OOXML formats ({@code .docx},
 * {@code .xlsx}, {@code .pptx}) are not handled by this class.
 *
 * @author Mubin Shrestha
 */
public class DocFileParser {
    /**
     * Returns the text of the given legacy Office file, choosing the extractor
     * from the file name suffix ({@code .doc}, {@code .xls} or {@code .ppt},
     * matched case-sensitively).
     *
     * @param fileName path of the file to read
     * @return the extracted text, or an empty string when the suffix is not
     *         one of the three above or when any exception occurs (the failure
     *         is reported on standard output)
     */
    public String DocFileContentParser(String fileName) {
        FileInputStream in = null;
        try {
            in = new FileInputStream(fileName);
            POIFSFileSystem fs = new POIFSFileSystem(in);
            if (fileName.endsWith(".doc")) {
                HWPFDocument doc = new HWPFDocument(fs);
                WordExtractor we = new WordExtractor(doc);
                return we.getText();
            } else if (fileName.endsWith(".xls")) {
                ExcelExtractor ex = new ExcelExtractor(fs);
                // Emit formula text rather than cached results, and prefix each sheet with its name.
                ex.setFormulasNotResults(true);
                ex.setIncludeSheetNames(true);
                return ex.getText();
            } else if (fileName.endsWith(".ppt")) {
                PowerPointExtractor extractor = new PowerPointExtractor(fs);
                return extractor.getText();
            }
        } catch (Exception e) {
            // Best-effort API: report the cause instead of discarding it, then fall through to "".
            System.out.println("document file cant be indexed: " + e);
        } finally {
            closeQuietly(in);
        }
        return "";
    }

    /** Closes the stream if it was opened; a failure to close is ignored because no further use is made of it. */
    private static void closeQuietly(FileInputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (java.io.IOException ignored) {
            // Nothing useful can be done here; the extraction result is already decided.
        }
    }

    /**
     * Prints the text of the file named by the hard-coded placeholder path.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String args[]){
        String filepath = "fullPath";
        System.out.println(new DocFileParser().DocFileContentParser(filepath));
    }
}
docx, xlsx, pptx 파서
import java.io.File;
import java.io.FileInputStream;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
/**
 * Extracts text from OOXML Microsoft Office files: {@code .docx},
 * {@code .pptx} and {@code .xlsx}.
 */
public class DocxFileParser {
    /**
     * Returns the text of the given OOXML file, choosing the extractor from the
     * file name suffix ({@code .docx}, {@code .pptx} or {@code .xlsx}, matched
     * case-sensitively).
     *
     * @param fileName path of the file to read
     * @return the extracted text, or an empty string when the suffix is not
     *         one of the three above or when any exception occurs (the
     *         exception message is printed to standard output)
     */
    public String docxFileContentParser(String fileName){
        FileInputStream fs = null;
        try {
            fs = new FileInputStream(new File(fileName));
            OPCPackage d = OPCPackage.open(fs);
            if (fileName.endsWith(".docx")) {
                XWPFWordExtractor xw = new XWPFWordExtractor(d);
                return xw.getText();
            } else if (fileName.endsWith(".pptx")) {
                XSLFPowerPointExtractor xp = new XSLFPowerPointExtractor(d);
                return xp.getText();
            } else if (fileName.endsWith(".xlsx")) {
                XSSFExcelExtractor xe = new XSSFExcelExtractor(d);
                // Emit formula text rather than cached results, and prefix each sheet with its name.
                xe.setFormulasNotResults(true);
                xe.setIncludeSheetNames(true);
                return xe.getText();
            }
        } catch (Exception e) {
            System.out.println("# DocxFileParser Error :"+e.getMessage());
        } finally {
            // Release the file handle on every path; the original never closed it.
            if (fs != null) {
                try {
                    fs.close();
                } catch (java.io.IOException ignored) {
                    // Nothing useful can be done here; the extraction result is already decided.
                }
            }
        }
        return "";
    }

    /**
     * Prints the text of the file named by the hard-coded placeholder path.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String args[]){
        String filePath = "fullPath";
        System.out.println(new DocxFileParser().docxFileContentParser(filePath));
    }
}
출처 : http://computergodzilla.blogspot.kr
'Programming > java' 카테고리의 다른 글
[java] 몇 분전, 몇 시간전, 몇 일전 표현 Util (0) | 2014.12.11 |
---|---|
[java] WebUtils 및 FileUtils (0) | 2014.11.18 |
[java] J2EE (0) | 2014.09.18 |
[java] 스레드(Thread) (0) | 2014.09.17 |
[java] 예외처리(Exception) (0) | 2014.09.17 |