Programming/java

[java] 문서 파싱 및 추출(pdf, doc, docx, xls, xlsx, ppt, pptx)

성일만 2014. 11. 13. 10:45

문서 파싱 및 추출(pdf, doc, docx, xls, xlsx, ppt, pptx)



필요 라이브러리: Apache PDFBox (pdf), Apache POI (doc, xls, ppt, docx, xlsx, pptx)



pdf 파서

import java.io.File;

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

import org.apache.pdfbox.cos.COSDocument;

import org.apache.pdfbox.pdfparser.PDFParser;

import org.apache.pdfbox.pdmodel.PDDocument;

import org.apache.pdfbox.util.PDFTextStripper;

 

/**

 * This class parses the pdf file.

 * i.e this class returns the text from the pdf file.

 * @author Mubin Shrestha

 */

public class PdfFileParser {

 

    public String PdfFileParser(String pdffilePath) throws FileNotFoundException, IOException

    {

        String content;

        FileInputStream fi = new FileInputStream(new File(pdffilePath));

        PDFParser parser = new PDFParser(fi);

        parser.parse();

        COSDocument cd = parser.getDocument();

        PDFTextStripper stripper = new PDFTextStripper();

        content = stripper.getText(new PDDocument(cd));

        cd.close();

        return content;

    }

     

    public static void main(String args[]) throws FileNotFoundException, IOException

    {

        String filepath = "fullPath";

        System.out.println(new PdfFileParser().PdfFileParser(filepath));    

    }

}



doc, xls, ppt 파서

import java.io.FileInputStream;

import org.apache.poi.hslf.extractor.PowerPointExtractor;

import org.apache.poi.hssf.extractor.ExcelExtractor;

import org.apache.poi.hssf.usermodel.HSSFWorkbook;

import org.apache.poi.hwpf.HWPFDocument;

import org.apache.poi.hwpf.extractor.WordExtractor;

import org.apache.poi.poifs.filesystem.POIFSFileSystem;

 

/**

* This class parses the microsoft word files except .docx,.pptx and 

* latest MSword files.

* @author Mubin Shrestha

*/

public class DocFileParser {

   

  public String DocFileContentParser(String fileName) {

      POIFSFileSystem fs = null;

      try {

          

          fs = new POIFSFileSystem(new FileInputStream(fileName));

           

          if(fileName.endsWith(".doc")) {

              HWPFDocument doc = new HWPFDocument(fs);

              WordExtractor we = new WordExtractor(doc); 

              return we.getText();

          }else if(fileName.endsWith(".xls")) {

//              HSSFWorkbook wb = new HSSFWorkbook(new FileInputStream(fileName));

              ExcelExtractor ex = new ExcelExtractor(fs);

              ex.setFormulasNotResults(true);

              ex.setIncludeSheetNames(true);

              return ex.getText();

          } else if (fileName.endsWith(".ppt")) {

              PowerPointExtractor extractor = new PowerPointExtractor(fs);

              return extractor.getText();

          }

 

      } catch (Exception e) {

          System.out.println("document file cant be indexed");

      }

      return "";

  }

 

  public static void main(String args[]){

      String filepath = "fullPath";

      System.out.println(new DocFileParser().DocFileContentParser(filepath));

  }

}



docx, xlsx, pptx 파서

import java.io.File;

import java.io.FileInputStream;

 

import org.apache.poi.openxml4j.opc.OPCPackage;

import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;

import org.apache.poi.xssf.extractor.XSSFExcelExtractor;

import org.apache.poi.xwpf.extractor.XWPFWordExtractor;

 

public class DocxFileParser {

 

    public String docxFileContentParser(String fileName){

 

        try{

            FileInputStream fs = new FileInputStream(new File(fileName));

            OPCPackage d = OPCPackage.open(fs);

            if(fileName.endsWith(".docx")){

                XWPFWordExtractor xw = new XWPFWordExtractor(d);

                return xw.getText();

            }else if(fileName.endsWith(".pptx")){

                XSLFPowerPointExtractor xp = new XSLFPowerPointExtractor(d);

                return xp.getText();

            }else if(fileName.endsWith(".xlsx")){

                XSSFExcelExtractor xe = new XSSFExcelExtractor(d);

                xe.setFormulasNotResults(true);

                xe.setIncludeSheetNames(true);

                return xe.getText();

            }

        }catch(Exception e){

            System.out.println("# DocxFileParser Error :"+e.getMessage());

        }

        return "";

    }

 

    public static void main(String args[]){

        String filePath = "fullPath";

        System.out.println(new DocxFileParser().docxFileContentParser(filePath));

    }    

}


출처 : http://computergodzilla.blogspot.kr







'Programming > java' 카테고리의 다른 글

[java] 몇 분전, 몇 시간전, 몇 일전 표현 Util  (0) 2014.12.11
[java] WebUtils 및 FileUtils  (0) 2014.11.18
[java] J2EE  (0) 2014.09.18
[java] 스레드(Thread)  (0) 2014.09.17
[java] 예외처리(Exception)  (0) 2014.09.17