code.onehippo.org is currently readonly. We are migrating to code.bloomreach.com, please continue working there on Monday 14/12. See: https://docs.bloomreach.com/display/engineering/GitLab

Commit 56ff16ba authored by Ate Douma's avatar Ate Douma

REPO-1950 Upgrade to jackrabbit-2.16.1-h1 and Tika 1.17

This introduces the new hippo-repository-tika module which now takes care of managing all the tika-parsers related dependencies (and exclusions),
and provides a new TikaFactory for loading the hippo-repository specific tika-config.xml, and creating new Tika instances using the corresponding TikaConfig.

The tika-core/tika-parsers dependency management previously configured in the hippo-cms7-project parent can (and should) no longer be used, and will therefore be removed.
parent 46c0fcd9
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright 2007-2017 Hippo B.V. (http://www.onehippo.com)
Copyright 2007-2018 Hippo B.V. (http://www.onehippo.com)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -28,6 +28,12 @@
<packaging>pom</packaging>
<dependencies>
<dependency>
<groupId>org.onehippo.cms7</groupId>
<artifactId>hippo-repository-tika</artifactId>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.onehippo.cms7</groupId>
<artifactId>hippo-repository-builtin</artifactId>
......
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright 2007-2017 Hippo B.V. (http://www.onehippo.com)
Copyright 2007-2018 Hippo B.V. (http://www.onehippo.com)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -59,6 +59,11 @@
<artifactId>hippo-repository-api</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.onehippo.cms7</groupId>
<artifactId>hippo-repository-tika</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.onehippo.cms7</groupId>
<artifactId>hippo-repository-config</artifactId>
......@@ -94,15 +99,6 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
</dependency>
<dependency>
<groupId>org.apache.xmlbeans</groupId>
<artifactId>xmlbeans-java6</artifactId>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
......@@ -111,10 +107,6 @@
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
</dependency>
<dependency>
<groupId>org.codehaus.groovy</groupId>
......
/*
* Copyright 2008-2016 Hippo B.V. (http://www.onehippo.com)
* Copyright 2008-2018 Hippo B.V. (http://www.onehippo.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -16,7 +16,6 @@
package org.hippoecm.repository;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
......@@ -89,6 +88,7 @@ import org.hippoecm.repository.query.lucene.ServicingNameFormat;
import org.hippoecm.repository.query.lucene.ServicingSearchIndex;
import org.hippoecm.repository.query.lucene.util.CachingMultiReaderQueryFilter;
import org.hippoecm.repository.query.lucene.util.SetDocIdSetBuilder;
import org.onehippo.repository.tika.TikaFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
......@@ -647,6 +647,9 @@ public class FacetedNavigationEngineImpl extends ServicingSearchIndex
QueryHandlerContext context = getContext();
HippoSharedItemStateManager stateMgr = (HippoSharedItemStateManager) context.getItemStateManager();
stateMgr.repository.setFacetedNavigationEngine(this);
if (getTikaConfigPath() == null) {
setTikaConfigPath(TikaFactory.getTikaConfigPath());
}
super.doInit();
}
......
/*
* Copyright 2012-2013 Hippo B.V. (http://www.onehippo.com)
* Copyright 2012-2018 Hippo B.V. (http://www.onehippo.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -17,11 +17,9 @@ package org.hippoecm.repository;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.util.Calendar;
import javax.jcr.Node;
......@@ -30,9 +28,10 @@ import javax.jcr.query.QueryResult;
import org.apache.commons.io.IOUtils;
import org.apache.jackrabbit.value.BinaryImpl;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripper;
import org.hippoecm.repository.api.HippoNodeType;
import org.junit.Before;
import org.junit.Test;
......@@ -156,7 +155,7 @@ public class PdfExtractedTextWithLineBreaksAreIndexedCorrectlyTest extends Repos
{
InputStream pdf = this.getClass().getResourceAsStream(WORDS_ON_NEW_LINE_WITHOUT_SPACES);
try {
PDFParser parser = new PDFParser(new BufferedInputStream(pdf));
PDFParser parser = new PDFParser(new RandomAccessBuffer(pdf));
PDDocument pdDocument = null;
try {
parser.parse();
......
/*
* Copyright 2010-2013 Hippo B.V. (http://www.onehippo.com)
* Copyright 2010-2018 Hippo B.V. (http://www.onehippo.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
......@@ -15,7 +15,6 @@
*/
package org.hippoecm.repository;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.CharArrayWriter;
......@@ -30,9 +29,10 @@ import javax.jcr.query.QueryResult;
import org.apache.commons.io.IOUtils;
import org.apache.jackrabbit.value.BinaryImpl;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripper;
import org.hippoecm.repository.api.HippoNodeType;
import org.junit.Before;
import org.junit.Test;
......@@ -172,7 +172,7 @@ public class PdfExtractionAndIndexingTest extends RepositoryTestCase {
if(includeHippoText) {
InputStream pdf = this.getClass().getResourceAsStream(UNITTEST_PDF_FILE_NAME);
try {
PDFParser parser = new PDFParser(new BufferedInputStream(pdf));
PDFParser parser = new PDFParser(new RandomAccessBuffer(pdf));
PDDocument pdDocument = null;
try {
parser.parse();
......@@ -237,7 +237,7 @@ public class PdfExtractionAndIndexingTest extends RepositoryTestCase {
} else {
InputStream pdf = this.getClass().getResourceAsStream(UNITTEST_PDF_FILE_NAME);
try {
PDFParser parser = new PDFParser(new BufferedInputStream(pdf));
PDFParser parser = new PDFParser(new RandomAccessBuffer(pdf));
PDDocument pdDocument = null;
try {
parser.parse();
......
......@@ -20,7 +20,7 @@
<parent>
<groupId>org.onehippo.cms7</groupId>
<artifactId>hippo-cms7-project</artifactId>
<version>29.3</version>
<version>29.4-SNAPSHOT</version>
</parent>
<name>Repository</name>
......@@ -47,6 +47,7 @@
<modules>
<module>api</module>
<module>tika</module>
<module>connector</module>
<module>provider</module>
<module>engine</module>
......@@ -72,7 +73,7 @@
<!-- use root project name for all project modules NOTICE files, should be the same as in the root NOTICE file -->
<notice.project.name>Hippo Repository</notice.project.name>
<hippo.jackrabbit.version>2.14.0-h2</hippo.jackrabbit.version>
<hippo.jackrabbit.version>2.16.1-h1-SNAPSHOT</hippo.jackrabbit.version>
<hippo.configuration-management.version>1.3.0-SNAPSHOT</hippo.configuration-management.version>
<hippo.commons.version>4.3.0-SNAPSHOT</hippo.commons.version>
<hippo.services.version>4.3.0-SNAPSHOT</hippo.services.version>
......@@ -92,8 +93,9 @@
<commons-beanutils.version>1.9.3</commons-beanutils.version>
<commons-collections.version>3.2.2</commons-collections.version>
<commons-collections4.version>4.1</commons-collections4.version>
<xmlbeans-java6.version>2.3.0</xmlbeans-java6.version>
<pdfbox.version>1.8.8</pdfbox.version>
<tika.version>1.17</tika.version>
<!-- keep pdfbox version aligned with the version tika-parsers depends on -->
<pdfbox.version>2.0.8</pdfbox.version>
<quartz.version>2.2.1</quartz.version>
<json-lib.version>2.4</json-lib.version>
<easymock.version>3.4</easymock.version>
......@@ -187,6 +189,158 @@
<version>${javax.jcr.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${tika.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>${tika.version}</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>jul-to-slf4j</artifactId>
</exclusion>
<exclusion>
<groupId>org.gagravarr</groupId>
<artifactId>vorbis-java-tika</artifactId>
</exclusion>
<exclusion>
<groupId>org.gagravarr</groupId>
<artifactId>vorbis-java-core</artifactId>
</exclusion>
<exclusion>
<groupId>com.googlecode.mp4parser</groupId>
<artifactId>isoparser</artifactId>
</exclusion>
<exclusion>
<groupId>edu.ucar</groupId>
<artifactId>netcdf4</artifactId>
</exclusion>
<exclusion>
<groupId>edu.ucar</groupId>
<artifactId>grib</artifactId>
</exclusion>
<exclusion>
<groupId>edu.ucar</groupId>
<artifactId>cdm</artifactId>
</exclusion>
<exclusion>
<groupId>edu.ucar</groupId>
<artifactId>httpservices</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.sis.core</groupId>
<artifactId>sis-utility</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.sis.storage</groupId>
<artifactId>sis-netcdf</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.sis.core</groupId>
<artifactId>sis-metadata</artifactId>
</exclusion>
<exclusion>
<groupId>org.opengis</groupId>
<artifactId>geoapi</artifactId>
</exclusion>
<exclusion>
<groupId>edu.usc.ir</groupId>
<artifactId>sentiment-analysis-parser</artifactId>
</exclusion>
<exclusion>
<groupId>com.healthmarketscience.jackcess</groupId>
<artifactId>jackcess</artifactId>
</exclusion>
<exclusion>
<groupId>com.healthmarketscience.jackcess</groupId>
<artifactId>jackcess-encrypt</artifactId>
</exclusion>
<exclusion>
<groupId>org.tallison</groupId>
<artifactId>jmatio</artifactId>
</exclusion>
<exclusion>
<groupId>org.codelibs</groupId>
<artifactId>jhighlight</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.james</groupId>
<artifactId>apache-mime4j-core</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.james</groupId>
<artifactId>apache-mime4j-dom</artifactId>
</exclusion>
<exclusion>
<groupId>com.drewnoakes</groupId>
<artifactId>metadata-extractor</artifactId>
</exclusion>
<exclusion>
<groupId>com.rometools</groupId>
<artifactId>rome</artifactId>
</exclusion>
<exclusion>
<groupId>org.ow2.asm</groupId>
<artifactId>asm</artifactId>
</exclusion>
<exclusion>
<artifactId>boilerpipe</artifactId>
<groupId>de.l3s.boilerpipe</groupId>
</exclusion>
<exclusion>
<groupId>com.pff</groupId>
<artifactId>java-libpst</artifactId>
</exclusion>
<exclusion>
<groupId>com.github.junrar</groupId>
<artifactId>junrar</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.cxf</groupId>
<artifactId>cxf-rt-rs-client</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.commons</groupId>
<artifactId>commons-exec</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
</exclusion>
<exclusion>
<groupId>com.googlecode.json-simple</groupId>
<artifactId>json-simple</artifactId>
</exclusion>
<exclusion>
<groupId>com.tdunning</groupId>
<artifactId>json</artifactId>
</exclusion>
<exclusion>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.jackrabbit</groupId>
<artifactId>jackrabbit-api</artifactId>
......@@ -238,11 +392,6 @@
<artifactId>pdfbox</artifactId>
<version>${pdfbox.version}</version>
</dependency>
<dependency>
<groupId>org.apache.xmlbeans</groupId>
<artifactId>xmlbeans-java6</artifactId>
<version>${xmlbeans-java6.version}</version>
</dependency>
<dependency>
<groupId>commons-beanutils</groupId>
<artifactId>commons-beanutils</artifactId>
......
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright 2018 Hippo B.V. (http://www.onehippo.com)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.onehippo.cms7</groupId>
<artifactId>hippo-repository</artifactId>
<version>5.3.0-SNAPSHOT</version>
</parent>
<name>Repository Tika Config Provider</name>
<description>Hippo Repository Tika Config Provider</description>
<artifactId>hippo-repository-tika</artifactId>
<dependencies>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${tika.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>${tika.version}</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
</dependencies>
<build>
<defaultGoal>package</defaultGoal>
</build>
</project>
/*
* Copyright 2018 Hippo B.V. (http://www.onehippo.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.onehippo.repository.tika;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.net.URL;
/**
 * Factory for {@link Tika} instances that are backed by the {@code tika-config.xml} resource
 * located in the same package as this class.
 * <p>
 * The {@link TikaConfig} is created once, when the private holder class is first initialized
 * (triggered by the first call to {@link #getTikaConfig()}), and is reused for every
 * {@link Tika} instance created through this factory.
 * </p>
 */
public final class TikaFactory {

    /** Simple name of the configuration resource, resolved relative to this class. */
    private static final String TIKA_CONFIG_RESOURCE_NAME = "tika-config.xml";

    /** Package-qualified resource path of the configuration, using '/' separators and no leading slash. */
    private static final String TIKA_CONFIG_RESOURCE_PATH =
            TikaFactory.class.getPackage().getName().replace('.', '/') + "/" + TIKA_CONFIG_RESOURCE_NAME;

    /** The configuration parsed from the bundled resource; assigned exactly once by the constructor. */
    private final TikaConfig tikaConfig;

    /**
     * Loads and parses the bundled Tika configuration.
     *
     * @throws IllegalStateException if the configuration resource cannot be found
     * @throws RuntimeException      if the resource cannot be read, parsed, or used to set up Tika
     */
    private TikaFactory() {
        final URL configUrl = getTikaConfigURL();
        if (configUrl == null) {
            // Class.getResource returns null for a missing resource; fail with a message that names
            // the resource instead of letting a bare NullPointerException escape from TikaConfig.
            throw new IllegalStateException("Tika configuration resource not found: " + TIKA_CONFIG_RESOURCE_PATH);
        }
        try {
            tikaConfig = new TikaConfig(configUrl);
        } catch (IOException e) {
            throw new RuntimeException("Failed to load "+TIKA_CONFIG_RESOURCE_PATH, e);
        } catch (SAXException e) {
            throw new RuntimeException("Failed to parse "+TIKA_CONFIG_RESOURCE_PATH, e);
        } catch (TikaException e) {
            throw new RuntimeException("Failed to instantiate Tika", e);
        }
    }

    /** Holder whose static initializer creates the single {@link TikaFactory} instance. */
    private static class SingletonHelper {
        private static final TikaFactory INSTANCE = new TikaFactory();
    }

    /**
     * Returns the resource path of the bundled Tika configuration.
     *
     * @return the package path of this class with '/' separators, followed by {@code /tika-config.xml}
     */
    public static String getTikaConfigPath() {
        return TIKA_CONFIG_RESOURCE_PATH;
    }

    /**
     * Resolves the bundled Tika configuration relative to this class.
     *
     * @return the URL of the configuration resource, or {@code null} if it cannot be found
     */
    public static URL getTikaConfigURL() {
        return TikaFactory.class.getResource(TIKA_CONFIG_RESOURCE_NAME);
    }

    /**
     * Returns the shared configuration, creating it on the first call.
     *
     * @return the {@link TikaConfig} parsed from the bundled resource
     */
    public static TikaConfig getTikaConfig() {
        return SingletonHelper.INSTANCE.tikaConfig;
    }

    /**
     * Creates a new {@link Tika} facade using the shared configuration.
     *
     * @return a new {@link Tika} instance
     */
    public static Tika newTika() {
        return new Tika(getTikaConfig());
    }

    /**
     * Creates a new {@link Tika} facade using the supplied detector and parser, combined with the
     * translator taken from the shared configuration.
     *
     * @param detector the detector to use
     * @param parser   the parser to use
     * @return a new {@link Tika} instance
     */
    public static Tika newTika(final Detector detector, final Parser parser) {
        return new Tika(detector, parser, getTikaConfig().getTranslator());
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<!--
Copyright 2018 Hippo B.V. (http://www.onehippo.com)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<properties>
<parsers>
<!-- Instead of using the DefaultParser, which tries to load all the parsers configured in
tika-parsers.jar/META-INF/services/org.apache.tika.parser.Parser,
only load those that are actually useful/supported, e.g. text, HTML or office document based parsers -->
<parser class="org.apache.tika.parser.CompositeParser">
<parser class="org.apache.tika.parser.chm.ChmParser"/>
<parser class="org.apache.tika.parser.epub.EpubParser"/>
<parser class="org.apache.tika.parser.html.HtmlParser"/>
<parser class="org.apache.tika.parser.iwork.IWorkPackageParser"/>
<parser class="org.apache.tika.parser.microsoft.OfficeParser"/>
<parser class="org.apache.tika.parser.microsoft.OldExcelParser"/>
<parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/>
<parser class="org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006.Word2006MLParser"/>
<parser class="org.apache.tika.parser.microsoft.xml.SpreadsheetMLParser"/>
<parser class="org.apache.tika.parser.microsoft.xml.WordMLParser"/>
<parser class="org.apache.tika.parser.odf.OpenDocumentParser"/>
<parser class="org.apache.tika.parser.pdf.PDFParser"/>
<parser class="org.apache.tika.parser.rtf.RTFParser"/>
<parser class="org.apache.tika.parser.txt.TXTParser"/>
<parser class="org.apache.tika.parser.wordperfect.QuattroProParser"/>
<parser class="org.apache.tika.parser.wordperfect.WordPerfectParser"/>
<parser class="org.apache.tika.parser.xml.DcXMLParser"/>
<parser class="org.apache.tika.parser.xml.FictionBookParser"/>
</parser>
<!-- below exclusions derived/inherited from default Jackrabbit tika-config.xml -->
<parser class="org.apache.tika.parser.EmptyParser">
<!-- Disable package extraction as it's too resource-intensive -->
<mime>application/x-archive</mime>
<mime>application/x-bzip</mime>
<mime>application/x-bzip2</mime>
<mime>application/x-cpio</mime>
<mime>application/x-gtar</mime>
<mime>application/x-gzip</mime>
<mime>application/x-tar</mime>
<mime>application/zip</mime>
<!-- Disable image extraction as there's no text to be found -->
<mime>image/bmp</mime>
<mime>image/gif</mime>
<mime>image/jpeg</mime>
<mime>image/png</mime>
<mime>image/vnd.wap.wbmp</mime>
<mime>image/x-icon</mime>
<mime>image/x-psd</mime>
<mime>image/x-xcf</mime>
</parser>
</parsers>
<!-- ignore Tika Parsers warnings for missing *optional* libraries -->
<service-loader initializableProblemHandler="ignore"/>
</properties>
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment