Apache tika 实现各种文档内容解析示例代码
作者:北执南念
这篇文章主要介绍了如何使用 Apache Tika 实现各种文档内容的解析。本文通过实例代码给大家介绍得非常详细,感兴趣的朋友跟随小编一起看看吧。
Apache tika 实现各种文档内容解析
1、依赖
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Coordinates of this demo project. -->
    <groupId>cn.js</groupId>
    <artifactId>TikaResouce</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- Compile for Java 8 and read sources as UTF-8. -->
    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <!-- Spring Boot parent POM. -->
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.7.0</version>
    </parent>

    <!-- Import the Tika BOM; the Tika dependencies below declare no version of their own. -->
    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>org.apache.tika</groupId>
                <artifactId>tika-bom</artifactId>
                <version>2.8.0</version>
                <type>pom</type>
                <scope>import</scope>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <dependencies>
        <!-- Spring MVC web starter. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- Commons FileUpload, pinned to an explicit version. -->
        <dependency>
            <groupId>commons-fileupload</groupId>
            <artifactId>commons-fileupload</artifactId>
            <version>1.4</version>
        </dependency>

        <!-- Tika core plus the standard parser bundle. -->
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-core</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.tika</groupId>
            <artifactId>tika-parsers-standard-package</artifactId>
        </dependency>
    </dependencies>
</project>
2、配置文件
在 src/main/resources 目录下新建一个 tika-config.xml 文件(后面的配置类会通过 classpath:tika-config.xml 加载它):
<?xml version="1.0" encoding="UTF-8"?>
<!-- Tika configuration: declares three encoding detectors, each with its own markLimit. -->
<properties>
    <encodingDetectors>
        <encodingDetector class="org.apache.tika.parser.html.HtmlEncodingDetector">
            <params>
                <param name="markLimit" type="int">64000</param>
            </params>
        </encodingDetector>

        <encodingDetector class="org.apache.tika.parser.txt.UniversalEncodingDetector">
            <params>
                <param name="markLimit" type="int">64001</param>
            </params>
        </encodingDetector>

        <encodingDetector class="org.apache.tika.parser.txt.Icu4jEncodingDetector">
            <params>
                <param name="markLimit" type="int">64002</param>
            </params>
        </encodingDetector>
    </encodingDetectors>
</properties>
3、配置类
package cn.js.config; import java.io.IOException; import java.io.InputStream; import org.apache.tika.Tika; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import org.springframework.core.io.Resource; import org.springframework.core.io.ResourceLoader; import org.xml.sax.SAXException; /** * tika配置类 */ @Configuration public class MyTikaConfig { @Autowired private ResourceLoader resourceLoader; @Bean public Tika tika() throws TikaException, IOException, SAXException { Resource resource = resourceLoader.getResource("classpath:tika-config.xml"); InputStream inputStream = resource.getInputStream(); TikaConfig config = new TikaConfig(inputStream); Detector detector = config.getDetector(); Parser autoDetectParser = new AutoDetectParser(config); return new Tika(detector, autoDetectParser); } }
4、Controller
package cn.js.controller; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.springframework.http.HttpRequest; import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.RestController; import org.springframework.web.multipart.MultipartFile; import javax.annotation.Resource; import java.io.IOException; import java.io.InputStream; @RestController @RequestMapping("/tika") public class TikaController { @Resource private Tika tika; @PostMapping("/pdf") public void TikaDemon(@RequestParam("file") MultipartFile file) throws IOException, TikaException { InputStream inputStream = file.getInputStream(); String s = tika.parseToString(inputStream); System.out.println(s); } }
到此,这篇关于 Apache Tika 实现各种文档内容解析的文章就介绍到这了。更多与 Apache Tika 文档内容解析相关的内容,请搜索脚本之家以前的文章或继续浏览下面的相关文章,希望大家以后多多支持脚本之家!