package com.centit.support.office;


import com.centit.support.network.HttpExecutor;
import com.centit.support.office.exception.ExtractorTextException;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xdgf.extractor.XDGFVisioExtractor;
import org.apache.poi.xdgf.usermodel.XmlVisioDocument;
import org.apache.poi.xslf.usermodel.*;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.util.NodeList;
import org.htmlparser.visitors.HtmlPage;

import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import java.io.*;
import java.util.Iterator;

/**
 * 解析常用文件内容工具类
 */
public final class FileTextExtractor {


    /**
     * Extracts the text of a Word document stored in a file.
     *
     * <p>The file is opened as a stream and handed to
     * {@link #extractorWord(InputStream)}. The stream is always closed before
     * this method returns, including when extraction fails.
     *
     * @param file the Word document to read
     * @return the text returned by {@link #extractorWord(InputStream)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     */
    public static String extractorWord(File file) throws ExtractorTextException, IOException {
        // The delegate only closes the stream on its success path, so own the
        // handle here; closing an already-closed FileInputStream is harmless.
        try (InputStream input = new FileInputStream(file)) {
            return extractorWord(input);
        }
    }

    /**
     * Extracts the text of a Word document located at the given path.
     *
     * @param filePath path of the Word document
     * @return the text returned by {@link #extractorWord(File)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     */
    public static String extractorWord(String filePath) throws ExtractorTextException, IOException {
        File wordFile = new File(filePath);
        return extractorWord(wordFile);
    }

    /**
     * Extracts the text of a Word document read from a stream.
     *
     * <p>The stream header is inspected to choose between the OLE2 based
     * reader ({@code HWPFDocument}) and the OOXML reader ({@code XWPFDocument}).
     * The stream is closed before this method returns, whether extraction
     * succeeds or fails. The extracted text is also echoed to standard output.
     *
     * @param input stream positioned at the start of the document; closed by this method
     * @return the extracted text, or {@code null} when neither header is recognised
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the stream cannot be read or parsed
     */
    public static String extractorWord(InputStream input) throws ExtractorTextException, IOException {
        String content = null;
        // Header sniffing has to un-read the leading bytes, so streams without
        // mark support are wrapped in a PushbackInputStream.
        try (InputStream stream = input.markSupported() ? input : new PushbackInputStream(input, 8)) {
            if (POIFSFileSystem.hasPOIFSHeader(stream)) {
                // OLE2 container (labelled "2003" in the original code).
                HWPFDocument document = new HWPFDocument(stream);
                WordExtractor extractor = new WordExtractor(document);
                content = extractor.getText();
            } else if (POIXMLDocument.hasOOXMLHeader(stream)) {
                // OOXML package (labelled "2007" in the original code).
                XWPFDocument document = new XWPFDocument(stream);
                XWPFWordExtractor extractor = new XWPFWordExtractor(document);
                content = extractor.getText();
            }
            // Any other header is silently left unsupported: content stays null.
        }
        System.out.println("Word_Text:"+"\n"+content);
        return content;
    }


    /**
     * Extracts the text of an Excel workbook stored in a file.
     *
     * <p>The file is opened as a stream and handed to
     * {@link #extractorExcel(InputStream)}. The stream is always closed before
     * this method returns, including when extraction fails.
     *
     * @param file the workbook to read
     * @return the text returned by {@link #extractorExcel(InputStream)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     * @throws InvalidFormatException propagated from {@link #extractorExcel(InputStream)}
     */
    public static String extractorExcel(File file) throws ExtractorTextException, IOException, InvalidFormatException {
        // The delegate only closes the stream on its success path, so own the
        // handle here; closing an already-closed FileInputStream is harmless.
        try (InputStream input = new FileInputStream(file)) {
            return extractorExcel(input);
        }
    }

    /**
     * Extracts the text of an Excel workbook located at the given path.
     *
     * @param filePath path of the workbook
     * @return the text returned by {@link #extractorExcel(File)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     * @throws InvalidFormatException propagated from {@link #extractorExcel(File)}
     */
    public static String extractorExcel(String filePath) throws ExtractorTextException, IOException, InvalidFormatException {
        File workbookFile = new File(filePath);
        return extractorExcel(workbookFile);
    }

    /**
     * Extracts the text of an Excel workbook read from a stream.
     *
     * <p>The stream header is inspected to choose between the OLE2 based
     * reader ({@code HSSFWorkbook}) and the OOXML reader ({@code XSSFWorkbook}).
     * The stream is closed before this method returns, whether extraction
     * succeeds or fails. The extracted text is also echoed to standard output,
     * and an unsupported header prints a notice there.
     *
     * @param input stream positioned at the start of the workbook; closed by this method
     * @return the extracted text, or {@code null} when neither header is recognised
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the stream cannot be read or parsed
     * @throws InvalidFormatException if the OOXML package cannot be opened
     */
    public static String extractorExcel(InputStream input) throws ExtractorTextException, IOException, InvalidFormatException {
        String content = null;
        // Header sniffing has to un-read the leading bytes, so streams without
        // mark support are wrapped in a PushbackInputStream.
        try (InputStream stream = input.markSupported() ? input : new PushbackInputStream(input, 8)) {
            if (POIFSFileSystem.hasPOIFSHeader(stream)) {
                // OLE2 container (labelled "2003" in the original code).
                HSSFWorkbook wb = new HSSFWorkbook(stream);
                ExcelExtractor extractor = new ExcelExtractor(wb);
                // Configure the extractor through its formulas-not-results
                // and include-sheet-names switches.
                extractor.setFormulasNotResults(true);
                extractor.setIncludeSheetNames(true);
                content = extractor.getText();
            } else if (POIXMLDocument.hasOOXMLHeader(stream)) {
                // OOXML package (labelled "2007" in the original code).
                XSSFWorkbook wb = new XSSFWorkbook(OPCPackage.open(stream));
                POIXMLTextExtractor extractorX = new XSSFExcelExtractor(wb);
                content = extractorX.getText();
            } else {
                System.out.println("版本不支持");
            }
        }
        System.out.println("Excel_Text:"+"\n"+content);
        return content;
    }


    /**
     * Extracts the text of a PowerPoint presentation stored in a file.
     *
     * <p>The file is opened as a stream and handed to
     * {@link #extractorPpt(InputStream)}. The stream is always closed before
     * this method returns, including when extraction fails.
     *
     * @param file the presentation to read
     * @return the text returned by {@link #extractorPpt(InputStream)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     * @throws InvalidFormatException declared in the signature; not raised by the code in this method
     */
    public static String extractorPpt(File file) throws ExtractorTextException, IOException, InvalidFormatException {
        // The delegate only closes the stream on its success path, so own the
        // handle here; closing an already-closed FileInputStream is harmless.
        try (InputStream input = new FileInputStream(file)) {
            return extractorPpt(input);
        }
    }

    /**
     * Extracts the text of a PowerPoint presentation located at the given path.
     *
     * @param filePath path of the presentation
     * @return the text returned by {@link #extractorPpt(File)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     * @throws InvalidFormatException declared by {@link #extractorPpt(File)}
     */
    public static String extractorPpt(String filePath) throws ExtractorTextException, IOException, InvalidFormatException {
        File presentationFile = new File(filePath);
        return extractorPpt(presentationFile);
    }

    /**
     * Extracts the text of a PowerPoint presentation read from a stream.
     *
     * <p>The stream header is inspected to choose between the OLE2 based
     * {@code PowerPointExtractor} and the OOXML {@code XMLSlideShow}. For
     * OOXML input every text run is followed by a tab and every slide by a
     * newline. The stream is closed before this method returns, whether
     * extraction succeeds or fails. The extracted text is also echoed to
     * standard output, and an unsupported header prints a notice there.
     *
     * @param input stream positioned at the start of the presentation; closed by this method
     * @return the extracted text, or an empty string when neither header is recognised
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the stream cannot be read or parsed
     */
    public static String extractorPpt(InputStream input) throws ExtractorTextException, IOException {
        String content = "";
        // Header sniffing has to un-read the leading bytes, so streams without
        // mark support are wrapped in a PushbackInputStream.
        try (InputStream stream = input.markSupported() ? input : new PushbackInputStream(input, 8)) {
            if (POIFSFileSystem.hasPOIFSHeader(stream)) {
                // OLE2 container (labelled "2003" in the original code).
                try (PowerPointExtractor extractor = new PowerPointExtractor(stream)) {
                    content = extractor.getText();
                }
            } else if (POIXMLDocument.hasOOXMLHeader(stream)) {
                // OOXML package (labelled "2007" in the original code).
                XMLSlideShow ppt = new XMLSlideShow(stream);
                // Accumulate in a builder: repeated String concatenation in
                // these nested loops is quadratic in the amount of text.
                StringBuilder slideText = new StringBuilder();
                for (XSLFSlide slide : ppt.getSlides()) {
                    for (XSLFShape shape : slide.getShapes()) {
                        if (shape instanceof XSLFTextShape) {
                            // A text shape iterates over its paragraphs,
                            // and a paragraph over its text runs.
                            for (XSLFTextParagraph paragraph : (XSLFTextShape) shape) {
                                for (XSLFTextRun run : paragraph) {
                                    slideText.append(run.getRawText()).append("\t");
                                }
                            }
                        }
                    }
                    // One line per slide.
                    slideText.append("\n");
                }
                content = slideText.toString();
            } else {
                System.out.println("版本不支持");
            }
        }
        System.out.println("Ppt_Text:"+"\n"+content);
        return content;
    }



    /**
     * Extracts the text of a Visio document stored in a file.
     *
     * <p>The file is parsed as an {@code XmlVisioDocument} and read through
     * {@code XDGFVisioExtractor}. The file stream is always closed before this
     * method returns, including when parsing fails. The extracted text is also
     * echoed to standard output.
     *
     * @param file the Visio document to read
     * @return the text reported by the extractor
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     */
    public static String extractorVisio(File file) throws ExtractorTextException, IOException {
        String content;
        // try-with-resources releases the file handle even when the document
        // constructor or the extractor throws.
        try (FileInputStream istream = new FileInputStream(file)) {
            XDGFVisioExtractor extractor = new XDGFVisioExtractor(new XmlVisioDocument(istream));
            content = extractor.getText();
        }
        System.out.println("Visio_Text:"+"\n"+content);
        return content;
    }

//    public  static String extractorVisio(String filePath) throws IOException {
//        VisioTextExtractor vsdExtractor = new VisioTextExtractor(
//                new FileInputStream(filePath));
//        String vsdString = vsdExtractor.getText();
//        vsdExtractor.close();
//        System.out.println("Visio_Text:"+"\n"+vsdString.toString());
//        return vsdString;
//    }

    /**
     * Extracts the text of an Outlook message stored in a file.
     *
     * <p>The file is opened as a stream and handed to
     * {@link #extractorOutLook(InputStream)}. The stream is always closed
     * before this method returns, including when extraction fails.
     *
     * @param file the Outlook message to read
     * @return the text returned by {@link #extractorOutLook(InputStream)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     */
    public static String extractorOutLook(File file) throws ExtractorTextException, IOException {
        // The delegate only closes the stream on its success path, so own the
        // handle here; closing an already-closed FileInputStream is harmless.
        try (InputStream input = new FileInputStream(file)) {
            return extractorOutLook(input);
        }
    }

    /**
     * Extracts the text of an Outlook message located at the given path.
     *
     * @param filePath path of the Outlook message
     * @return the text returned by {@link #extractorOutLook(File)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     */
    public static String extractorOutLook(String filePath) throws ExtractorTextException, IOException {
        File messageFile = new File(filePath);
        return extractorOutLook(messageFile);
    }
    /**
     * Extracts the text of an Outlook message read from a stream.
     *
     * <p>The stream is loaded into an {@code NPOIFSFileSystem} and read through
     * {@code OutlookTextExtactor}. The extractor, the file system and the
     * stream are closed in that order before this method returns, whether
     * extraction succeeds or fails. The extracted text is also echoed to
     * standard output.
     *
     * @param input stream positioned at the start of the message; closed by this method
     * @return the text reported by the extractor
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the stream cannot be read or parsed
     */
    public static String extractorOutLook(InputStream input) throws ExtractorTextException, IOException {
        String content;
        // Resources are closed in reverse declaration order, matching the
        // original extractor -> file system -> stream sequence, and are now
        // released on the failure path as well.
        try (InputStream stream = input;
             NPOIFSFileSystem poifs = new NPOIFSFileSystem(stream);
             OutlookTextExtactor extractor = new OutlookTextExtactor(poifs)) {
            content = extractor.getText();
        }
        System.out.println("OutLook_Text:"+"\n"+content);
        return content;
    }


    /**
     * Extracts the text of a PDF document stored in a file.
     *
     * <p>The file is opened as a stream and handed to
     * {@link #extractorPdf(InputStream)}. The stream is always closed before
     * this method returns, including when extraction fails.
     *
     * @param file the PDF document to read
     * @return the text returned by {@link #extractorPdf(InputStream)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     */
    public static String extractorPdf(File file) throws ExtractorTextException, IOException {
        // The delegate only closes the stream on its success path, so own the
        // handle here; closing an already-closed FileInputStream is harmless.
        try (InputStream input = new FileInputStream(file)) {
            return extractorPdf(input);
        }
    }

    /**
     * Extracts the text of a PDF document located at the given path.
     *
     * @param filePath path of the PDF document
     * @return the text returned by {@link #extractorPdf(File)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     */
    public static String extractorPdf(String filePath) throws ExtractorTextException, IOException {
        File pdfFile = new File(filePath);
        return extractorPdf(pdfFile);
    }
    /**
     * Extracts the text of a PDF document read from a stream.
     *
     * <p>Pages are read in order, starting at page 1, and their text is
     * concatenated without any separator. The {@code PdfReader} and the stream
     * are closed before this method returns, whether extraction succeeds or
     * fails. The extracted text is also echoed to standard output.
     *
     * @param input stream positioned at the start of the PDF; closed by this method
     * @return the concatenated text of all pages
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the stream cannot be read or parsed
     */
    public static String extractorPdf(InputStream input) throws ExtractorTextException, IOException {
        // A builder avoids re-copying the accumulated text for every page.
        StringBuilder pages = new StringBuilder();
        try (InputStream stream = input) {
            PdfReader reader = new PdfReader(stream);
            try {
                int pageCount = reader.getNumberOfPages();
                // iText page numbers are 1-based.
                for (int page = 1; page <= pageCount; page++) {
                    pages.append(PdfTextExtractor.getTextFromPage(reader, page));
                }
            } finally {
                // The reader was previously never released.
                reader.close();
            }
        }
        String content = pages.toString();
        // Label corrected: the original printed "txt_Text:" for PDF output.
        System.out.println("Pdf_Text:" + "\n" + content);
        return content;
    }


    /**
     * Extracts the content of a plain-text file.
     *
     * <p>The file is opened as a stream and handed to
     * {@link #extractorTxt(InputStream)}. The stream is always closed before
     * this method returns, including when reading fails.
     *
     * @param file the text file to read
     * @return the text returned by {@link #extractorTxt(InputStream)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     */
    public static String extractorTxt(File file) throws ExtractorTextException, IOException {
        // The delegate only closes the stream on its success path, so own the
        // handle here; closing an already-closed FileInputStream is harmless.
        try (InputStream input = new FileInputStream(file)) {
            return extractorTxt(input);
        }
    }

    /**
     * Extracts the content of a plain-text file located at the given path.
     *
     * @param filePath path of the text file
     * @return the text returned by {@link #extractorTxt(File)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     */
    public static String extractorTxt(String filePath) throws ExtractorTextException, IOException {
        File textFile = new File(filePath);
        return extractorTxt(textFile);
    }

        /**
         * Extracts the content of a plain-text stream, guessing its encoding
         * from the first two bytes.
         *
         * <p>The leading bytes select the charset: {@code EF BB} gives UTF-8,
         * {@code FF FE} gives "Unicode", {@code FE FF} gives UTF-16BE,
         * {@code 5C 75} gives ASCII, and anything else falls back to GBK. The
         * sniffed bytes are pushed back before decoding so no content is lost;
         * only a byte-order mark the chosen decoder would not consume itself is
         * skipped. Each line of the result is terminated with {@code '\n'}.
         * The stream is closed before this method returns, whether reading
         * succeeds or fails. The text is also echoed to standard output.
         *
         * @param input stream positioned at the start of the text; closed by this method
         * @return the decoded text with normalised line terminators
         * @throws ExtractorTextException declared in the signature; not raised by the code in this method
         * @throws IOException if the stream cannot be read
         */
        public static String extractorTxt (InputStream input) throws ExtractorTextException, IOException {
            StringBuilder text = new StringBuilder();
            // Closing the buffered wrapper also closes the caller's stream.
            try (BufferedInputStream bin = new BufferedInputStream(input)) {
                // Peek at the signature, then rewind. The original decoded from
                // the raw stream after the buffer had already pre-fetched data,
                // which silently dropped the start of the text.
                bin.mark(3);
                int first = bin.read();
                int second = bin.read();
                int third = bin.read();
                bin.reset();

                String eCode;
                int bomLength = 0;
                switch ((first << 8) + second) {
                    case 0xefbb:
                        eCode = "UTF-8";
                        // Skip the mark only when the full EF BB BF is present.
                        if (third == 0xbf) {
                            bomLength = 3;
                        }
                        break;
                    case 0xfffe:
                        // The BOM stays in the stream so the decoder can pick
                        // the byte order from it.
                        eCode = "Unicode";
                        break;
                    case 0xfeff:
                        eCode = "UTF-16BE";
                        bomLength = 2;
                        break;
                    case 0x5c75:
                        eCode = "ASCII";
                        break;
                    default:
                        eCode = "GBK";
                }
                for (int skipped = 0; skipped < bomLength; skipped++) {
                    bin.read();
                }

                // Decode from the buffered stream, not the raw one.
                BufferedReader reader = new BufferedReader(new InputStreamReader(bin, eCode));
                String line;
                while ((line = reader.readLine()) != null) {
                    text.append(line).append('\n');
                }
            }
            String content = text.toString();
            System.out.println("txt_Text:" + "\n" + content);
            return content;
        }



    /**
     * Extracts the text of an RTF document stored in a file.
     *
     * <p>The file is opened as a stream and handed to
     * {@link #extractorRtf(InputStream)}. The stream is always closed before
     * this method returns, including when parsing fails.
     *
     * @param file the RTF document to read
     * @return the text returned by {@link #extractorRtf(InputStream)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     * @throws BadLocationException propagated from {@link #extractorRtf(InputStream)}
     */
    public static String extractorRtf(File file) throws ExtractorTextException, IOException, BadLocationException {
        // The delegate only closes the stream on its success path, so own the
        // handle here; closing an already-closed FileInputStream is harmless.
        try (InputStream input = new FileInputStream(file)) {
            return extractorRtf(input);
        }
    }

    /**
     * Extracts the text of an RTF document located at the given path.
     *
     * @param filePath path of the RTF document
     * @return the text returned by {@link #extractorRtf(File)}
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException if the file cannot be opened or read
     * @throws BadLocationException propagated from {@link #extractorRtf(File)}
     */
    public static String extractorRtf(String filePath) throws ExtractorTextException, IOException, BadLocationException {
        File rtfFile = new File(filePath);
        return extractorRtf(rtfFile);
    }
    /**
     * Extracts the text of an RTF document read from a stream.
     *
     * <p>The stream is parsed with {@code RTFEditorKit} into a
     * {@code DefaultStyledDocument} and the document's full text is returned.
     * The stream is closed before this method returns, whether parsing
     * succeeds or fails. The extracted text is also echoed to standard output.
     *
     * @param input stream positioned at the start of the RTF data; closed by this method
     * @return the plain text of the parsed document
     * @throws IOException if the stream cannot be read
     * @throws BadLocationException if the parsed document rejects the requested text range
     */
    public  static String extractorRtf(InputStream input) throws IOException, BadLocationException {
        String bodyText;
        // try-with-resources closes the stream on the failure path too; the
        // original only closed it after a successful parse.
        try (InputStream stream = input) {
            DefaultStyledDocument styledDoc = new DefaultStyledDocument();
            new RTFEditorKit().read(stream, styledDoc, 0);
            bodyText = styledDoc.getText(0, styledDoc.getLength());
        }
        System.out.println("txt_Rtf:" + "\n" + bodyText);
        return bodyText;
    }


    /**
     * Reads an HTML resource and returns the plain text of its body.
     *
     * <p>The resource is parsed with {@code org.htmlparser.Parser}; the plain
     * text of every body node is concatenated in document order. The result is
     * also echoed to standard output. Any exception raised while parsing is
     * swallowed and reported as a {@code null} return value.
     *
     * @param urlString resource handed to the {@code Parser} constructor
     * @return the concatenated body text, or {@code null} if parsing failed
     * @throws ExtractorTextException declared in the signature; not raised by the code in this method
     * @throws IOException declared in the signature; not raised by the code in this method
     */
    public static String extractorHTML(String urlString) throws ExtractorTextException, IOException {
        try {
            Parser htmlParser = new Parser(urlString);
            HtmlPage page = new HtmlPage(htmlParser);
            htmlParser.visitAllNodesWith(page);

            NodeList bodyNodes = page.getBody();
            StringBuilder plainText = new StringBuilder();
            for (int index = 0, count = bodyNodes.size(); index < count; index++) {
                Node bodyNode = bodyNodes.elementAt(index);
                plainText.append(bodyNode.toPlainTextString());
            }

            String result = plainText.toString();
            System.out.println("test_HTML" + "\n" + result);
            return result;
        } catch (Exception ignored) {
            // Best effort: a failure is signalled to the caller as null.
            return null;
        }
    }

    /**
     * Fetches a web page over HTTP and returns the plain text of its body.
     *
     * <p>The page is downloaded with {@code HttpExecutor.simpleGet}, parsed
     * with {@code org.htmlparser.Parser}, and the plain text of every body
     * node is concatenated in document order. Any exception raised while
     * downloading or parsing is swallowed and reported as a {@code null}
     * return value.
     *
     * @param urlString address of the page to download
     * @return the concatenated body text, or {@code null} if the request or parsing failed
     */
    public  static String getWebUrl(String urlString) {
        try {
            String pageSource = HttpExecutor.simpleGet(
                    HttpExecutor.createHttpClient(),
                    urlString, (String)null);

            Parser htmlParser = new Parser();
            htmlParser.setInputHTML(pageSource);
            HtmlPage page = new HtmlPage(htmlParser);
            htmlParser.visitAllNodesWith(page);

            NodeList bodyNodes = page.getBody();
            StringBuilder plainText = new StringBuilder();
            for (int index = 0, count = bodyNodes.size(); index < count; index++) {
                Node bodyNode = bodyNodes.elementAt(index);
                plainText.append(bodyNode.toPlainTextString());
            }
            return plainText.toString();
        } catch (Exception ignored) {
            // Best effort: a failure is signalled to the caller as null.
            return null;
        }
    }
}
