Tika是一个用于文档解析与文本内容提取的框架。

Tika可以解析多种文档类型:除txt、html、xml等文本格式的文档外,还支持pdf、word、ppt等大量非文本格式的文档。在构建索引前,可以通过该框架从这些文档中提取文本数据。

package com.mrdubo.main;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
import java.io.*;
/**
 * Minimal Apache Tika example: extracts the plain-text body of a file and
 * prints the metadata collected during parsing.
 */
public class FirstTika {

    /**
     * Parses {@code file} with an {@link AutoDetectParser} and returns its text content.
     *
     * <p>As a side effect, every metadata entry produced by the parse is printed to
     * standard output as {@code name:value}.
     *
     * @param file the document to parse; must not be {@code null}
     * @return the extracted body text, or {@code null} if the file could not be
     *         opened, read or parsed (the stack trace is printed to standard error)
     */
    public String fileToText(File file) {
        Parser parser = new AutoDetectParser();

        Metadata metadata = new Metadata();
        // Pass the file name along with the stream so it is available to the parser.
        metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());

        // -1 disables BodyContentHandler's default 100,000-character write limit;
        // with the default, larger documents abort with a SAXException and this
        // method would return null instead of the text.
        BodyContentHandler bodyContentHandler = new BodyContentHandler(-1);

        ParseContext parseContext = new ParseContext();
        // Make the same parser available through the context (used by Tika when
        // it encounters embedded documents).
        parseContext.set(Parser.class, parser);

        // try-with-resources closes the stream on every path.
        try (InputStream is = new FileInputStream(file)) {
            parser.parse(is, bodyContentHandler, metadata, parseContext);
        } catch (IOException | SAXException | TikaException e) {
            // FileNotFoundException is an IOException, so it is covered here as well.
            e.printStackTrace();
            return null;
        }

        for (String name : metadata.names()) {
            System.out.println(name + ":" + metadata.get(name));
        }
        return bodyContentHandler.toString();
    }

    /**
     * Demo entry point: extracts and prints the text of {@code ./pdf.pdf}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        FirstTika firstTika = new FirstTika();
        System.out.println(firstTika.fileToText(new File("./pdf.pdf")));
    }
}