Skip to content

Commit b924d67

Browse files
authored
Merge pull request #2076 from dadoonet/es-9
Add support for Elasticsearch 9
2 parents c0589a3 + e286e59 commit b924d67

File tree

50 files changed

+597
-261
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+597
-261
lines changed

.github/workflows/pr.yml

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828
- name: Build but the docker images
2929
run: mvn --batch-mode install -Ddocker.skip -DskipIntegTests
3030

31-
# We run integration tests with elastic stack 8 (default)
31+
# We run integration tests with elastic stack 9 (default)
3232
it:
3333
runs-on: ubuntu-latest
3434
needs: build
@@ -48,6 +48,26 @@ jobs:
4848
- name: Run the integration tests
4949
run: mvn --batch-mode install -Ddocker.skip -DskipUnitTests -Dtests.parallelism=1 -Dtests.output=always -Dtests.leaveTemporary=false
5050

51+
# We run integration tests with elastic stack 8
52+
it-es8:
53+
runs-on: ubuntu-latest
54+
needs: build
55+
steps:
56+
- uses: actions/checkout@v4
57+
- name: Set up JDK 21
58+
uses: actions/setup-java@v4
59+
with:
60+
java-version: '21'
61+
distribution: 'temurin'
62+
cache: 'maven'
63+
- name: Cache Docker images
64+
uses: AndreKurait/[email protected]
65+
with:
66+
key: fscrawler-docker-cache-${{ runner.os }}-${{ hashFiles('pom.xml') }}
67+
continue-on-error: true
68+
- name: Run the integration tests
69+
run: mvn --batch-mode install -Ddocker.skip -DskipUnitTests -Pes-8x -Dtests.parallelism=1 -Dtests.leaveTemporary=false
70+
5171
# We run integration tests with elastic stack 7
5272
it-es7:
5373
runs-on: ubuntu-latest

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ This crawler helps to index binary documents such as PDF, Open Office, MS Office
1414

1515
Current "most stable" versions are:
1616

17-
| Elasticsearch | FS Crawler | Released | Docs |
18-
|---------------|---------------|------------|-------------------------------------------------------------------------------|
19-
| 6.x, 7.x, 8.x | 2.10-SNAPSHOT | | [2.10-SNAPSHOT](https://fscrawler.readthedocs.io/en/latest/) |
17+
| Elasticsearch | FS Crawler | Released | Docs |
18+
|--------------------|---------------|------------|-------------------------------------------------------------------------------|
19+
| 6.x, 7.x, 8.x, 9.x | 2.10-SNAPSHOT | | [2.10-SNAPSHOT](https://fscrawler.readthedocs.io/en/latest/) |
2020

2121
[![Maven Central](https://img.shields.io/maven-central/v/fr.pilato.elasticsearch.crawler/fscrawler-distribution)](https://repo1.maven.org/maven2/fr/pilato/elasticsearch/crawler/fscrawler-distribution/)
2222
![GitHub Release Date](https://img.shields.io/github/release-date/dadoonet/fscrawler)

contrib/docker-compose-example-elasticsearch/.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ ELASTIC_PASSWORD=changeme
1414
KIBANA_PASSWORD=changeme
1515

1616
# Version of Elastic products
17-
STACK_VERSION=8.17.4
17+
STACK_VERSION=9.0.0
1818

1919
# Set the cluster name
2020
CLUSTER_NAME=docker-cluster

contrib/docker-compose-it-v7/.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ ELASTIC_PASSWORD=changeme
77
KIBANA_PASSWORD=changeme
88

99
# Version of Elastic products
10-
STACK_VERSION=7.17.23
10+
STACK_VERSION=7.17.28
1111

1212
# Set the cluster name
1313
CLUSTER_NAME=docker-cluster

contrib/docker-compose-it/.env

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ ELASTIC_PASSWORD=changeme
77
KIBANA_PASSWORD=changeme
88

99
# Version of Elastic products
10-
STACK_VERSION=8.17.4
10+
STACK_VERSION=9.0.0
1111

1212
# Set the cluster name
1313
CLUSTER_NAME=docker-cluster

core/src/main/java/fr/pilato/elasticsearch/crawler/fs/service/FsCrawlerManagementServiceElasticsearchImpl.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ public Collection<String> getFileDirectory(String path)
8888
Collection<String> files = new ArrayList<>();
8989

9090
try {
91+
// This call is just to avoid errors if the index is not fully allocated yet
92+
client.waitForHealthyIndex(settings.getElasticsearch().getIndex());
9193
ESSearchResponse response = client.search(
9294
new ESSearchRequest()
9395
.withIndex(settings.getElasticsearch().getIndex())
@@ -125,6 +127,8 @@ public Collection<String> getFolderDirectory(String path) throws Exception {
125127
Collection<String> files = new ArrayList<>();
126128

127129
try {
130+
// This call is just to avoid errors if the index is not fully allocated yet
131+
client.waitForHealthyIndex(settings.getElasticsearch().getIndexFolder());
128132
ESSearchResponse response = client.search(
129133
new ESSearchRequest()
130134
.withIndex(settings.getElasticsearch().getIndexFolder())

crawler/crawler-ftp/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ftp/FileAbstractorFTP.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,13 @@ public void open() throws IOException {
180180

181181
@Override
182182
public void close() throws IOException {
183-
if (ftp.isConnected()) {
184-
ftp.logout();
185-
ftp.disconnect();
183+
try {
184+
if (ftp.isConnected()) {
185+
ftp.logout();
186+
ftp.disconnect();
187+
}
188+
} catch (IOException e) {
189+
logger.warn("Error during FTP logout: {}", e.getMessage());
186190
}
187191
}
188192

distribution/pom.xml

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -181,16 +181,6 @@
181181
<goal>push</goal>
182182
</goals>
183183
</execution>
184-
<execution>
185-
<!-- There is no integration test, so we can skip that execution -->
186-
<id>start-elasticsearch</id>
187-
<phase>none</phase>
188-
</execution>
189-
<execution>
190-
<!-- There is no integration test, so we can skip that execution -->
191-
<id>stop-elasticsearch</id>
192-
<phase>none</phase>
193-
</execution>
194184
</executions>
195185
</plugin>
196186
</plugins>

docs/source/admin/fs/elasticsearch.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ by setting ``push_templates`` to ``false``:
112112
push_templates: false
113113
114114
If you want to see the component templates and index templates
115-
that will be created, you can get them from `the source <https://github.com/dadoonet/fscrawler/blob/master/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/8>`__.
115+
that will be created, you can get them from `the source <https://github.com/dadoonet/fscrawler/blob/master/elasticsearch-client/src/main/resources/fr/pilato/elasticsearch/crawler/fs/client/9>`__.
116116

117117
Creating your own mapping (analyzers)
118118
"""""""""""""""""""""""""""""""""""""

docs/source/conf.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -221,28 +221,31 @@ def read_version(full_version=True):
221221
.. |ES_version6| replace:: Elasticsearch {fmt_es_version6}
222222
.. |ES_version7| replace:: Elasticsearch {fmt_es_version7}
223223
.. |ES_version8| replace:: Elasticsearch {fmt_es_version8}
224+
.. |ES_version9| replace:: Elasticsearch {fmt_es_version9}
224225
.. |JPEG2000_version| replace:: jai-imageio-jpeg2000:{fmt_jpeg_version}
225226
.. |Download_URL| replace:: Sonatype
226227
.. |Maven_Central| replace:: Maven Central
227228
.. |Sonatype| replace:: Sonatype
228229
229230
.. _Tika: https://tika.apache.org/{fmt_tika_version}/
230-
.. _ES: https://www.elastic.co/products/elasticsearch
231+
.. _ES: https://www.elastic.co/elasticsearch
231232
.. _Tika_format: https://tika.apache.org/{fmt_tika_version}/formats.html#Supported_Document_Formats
232233
.. _Tika_version: https://tika.apache.org/{fmt_tika_version}/
233234
.. _Tika_configuring: https://tika.apache.org/{fmt_tika_version}/configuring.html
234-
.. _ES_version6: https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html
235-
.. _ES_version7: https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html
236-
.. _ES_version8: https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html
235+
.. _ES_version6: https://www.elastic.co/guide/en/elasticsearch/reference/6.8/index.html
236+
.. _ES_version7: https://www.elastic.co/guide/en/elasticsearch/reference/7.17/index.html
237+
.. _ES_version8: https://www.elastic.co/guide/en/elasticsearch/reference/8.18/index.html
238+
.. _ES_version9: https://www.elastic.co/docs/solutions/search
237239
.. _JPEG2000_version: https://repo1.maven.org/maven2/com/github/jai-imageio/jai-imageio-jpeg2000/{fmt_jpeg_version}/
238240
.. _Download_URL: {fmt_downloadUrl}
239241
.. _Maven_Central: https://repo1.maven.org/maven2/fr/pilato/elasticsearch/crawler/fscrawler-distribution/
240-
.. _Sonatype: https://s01.oss.sonatype.org/content/repositories/snapshots/fr/pilato/elasticsearch/crawler/fscrawler-distribution/
242+
.. _Sonatype: https://central.sonatype.com/service/rest/repository/browse/maven-snapshots/fr/pilato/elasticsearch/crawler/fscrawler-distribution/
241243
""".format(
242244
fmt_tika_version=config.get('3rdParty', 'TikaVersion'),
243245
fmt_es_version6=config.get('3rdParty', 'ElasticsearchVersion6'),
244246
fmt_es_version7=config.get('3rdParty', 'ElasticsearchVersion7'),
245247
fmt_es_version8=config.get('3rdParty', 'ElasticsearchVersion8'),
248+
fmt_es_version9=config.get('3rdParty', 'ElasticsearchVersion9'),
246249
fmt_jpeg_version=config.get('3rdParty', 'JpegVersion'),
247250
fmt_downloadUrl=downloadUrl,
248251
fmt_release=release

docs/source/fscrawler.ini

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Version=2.10-SNAPSHOT
44
[3rdParty]
55
TikaVersion=3.1.0
66
ElasticsearchVersion6=6.8.23
7-
ElasticsearchVersion7=7.17.23
8-
ElasticsearchVersion8=8.17.4
7+
ElasticsearchVersion7=7.17.28
8+
ElasticsearchVersion8=8.18.0
9+
ElasticsearchVersion9=9.0.0
910
JpegVersion=1.4.0

docs/source/index.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@ This crawler helps to index binary documents such as PDF, Open Office, MS Office
2222

2323
FS Crawler |release| is using |Tika_version|_ and is tested against:
2424

25+
* |ES_version9|_.
2526
* |ES_version8|_.
26-
* |ES_version7|_.
27+
* |ES_version7|_. (Deprecated)
2728
* |ES_version6|_. (Deprecated)
2829

2930
.. toctree::

docs/source/installation.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ Prepare a ``.env`` file with the following content:
185185
FS_JAVA_OPTS="-DLOG_LEVEL=debug -DDOC_LEVEL=debug"
186186
187187
# Change the STACK_VERSION if needed
188-
STACK_VERSION=8.17.3
188+
STACK_VERSION=9.0.0
189189
ELASTIC_PASSWORD=changeme
190190
KIBANA_PASSWORD=changeme
191191
CLUSTER_NAME=docker-cluster

docs/source/release/2.10.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ New
3737
* Add temporal information in folder index. Thanks to bdauvissat
3838
* Add support for external metadata files while crawling, defaults to ``.meta.yml``. See :ref:`tags` Thanks to dadoonet.
3939
* The job name is not mandatory anymore and it will be ``fscrawler`` by default. Thanks to dadoonet.
40+
* FSCrawler also supports Elasticsearch 9. Thanks to dadoonet.
4041

4142
Fix
4243
---

docs/source/user/getting_started.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ you want to index in it and start again:
3333
.. code:: sh
3434
3535
$ bin/fscrawler
36-
17:41:45,395 INFO [f.p.e.c.f.FsCrawlerImpl] FSCrawler is now connected to Elasticsearch version [8.17.3]
36+
17:41:45,395 INFO [f.p.e.c.f.FsCrawlerImpl] FSCrawler is now connected to Elasticsearch version [9.0.0]
3737
17:41:45,395 INFO [f.p.e.c.f.FsCrawlerImpl] FSCrawler started in watch mode. It will run unless you stop it with CTRL+C.
3838
17:41:45,395 INFO [f.p.e.c.f.FsParserAbstract] FS crawler started for [fscrawler] for [/tmp/es] every [15m]
3939

docs/source/user/options.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ option:
2222
.. code:: sh
2323
2424
$ bin/fscrawler --loop 1
25-
17:41:45,395 INFO [f.p.e.c.f.FsCrawlerImpl] FSCrawler is now connected to Elasticsearch version [8.17.3]
25+
17:41:45,395 INFO [f.p.e.c.f.FsCrawlerImpl] FSCrawler is now connected to Elasticsearch version [9.0.0]
2626
17:41:45,395 INFO [f.p.e.c.f.FsCrawlerImpl] FSCrawler started in watch mode. It will run unless you stop it with CTRL+C.
2727
17:41:45,395 INFO [f.p.e.c.f.FsParserAbstract] FS crawler started for [fscrawler] for [/tmp/es] every [15m]
2828
17:44:57,865 INFO [f.p.e.c.f.FsParserAbstract] Run #1: job [fscrawler]: starting...

docs/src/main/resources/fscrawler.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@ TikaVersion=${tika.version}
66
ElasticsearchVersion6=${elasticsearch6.version}
77
ElasticsearchVersion7=${elasticsearch7.version}
88
ElasticsearchVersion8=${elasticsearch8.version}
9+
ElasticsearchVersion9=${elasticsearch9.version}
910
JpegVersion=${jpeg.version}

elasticsearch-client/pom.xml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,6 @@
6363
<artifactId>fscrawler-test-framework</artifactId>
6464
<scope>test</scope>
6565
</dependency>
66-
<dependency>
67-
<groupId>org.testcontainers</groupId>
68-
<artifactId>elasticsearch</artifactId>
69-
<scope>test</scope>
70-
</dependency>
7166
<dependency>
7267
<groupId>org.testcontainers</groupId>
7368
<artifactId>nginx</artifactId>

elasticsearch-client/src/main/java/fr/pilato/elasticsearch/crawler/fs/client/ElasticsearchClient.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ public void start() throws ElasticsearchClientException {
191191

192192
if (semanticSearch) {
193193
// Check the version we are running or if it's using serverless
194-
if ((majorVersion >= 8 && minorVersion >= 17) || serverless) {
194+
if ((majorVersion >= 8 && minorVersion >= 17) || serverless || (majorVersion >= 9)) {
195195
logger.debug("Semantic search is enabled and we are running on a version of Elasticsearch {} " +
196196
"which is 8.17 or higher. We will try to use the semantic search features.", version);
197197
license = getLicense();
@@ -474,11 +474,17 @@ public void waitForHealthyIndex(String index) throws ElasticsearchClientExceptio
474474
private String catIndicesHealth(String index) {
475475
try {
476476
String response = httpGet("_cat/indices/" + index,
477-
new AbstractMap.SimpleImmutableEntry<>("h", "health"));
477+
new AbstractMap.SimpleImmutableEntry<>("h", "health"));
478478
DocumentContext document = parseJsonAsDocumentContext(response);
479479
String health = document.read("$[0].health");
480480
logger.trace("index [{}] health: [{}]", index, health);
481481
return health;
482+
} catch (WebApplicationException e) {
483+
if (e.getResponse().getStatus() == Response.Status.NOT_FOUND.getStatusCode()) {
484+
logger.debug("Index [{}] not found yet", index);
485+
return null;
486+
}
487+
throw e;
482488
} catch (ElasticsearchClientException e) {
483489
throw new RuntimeException(e);
484490
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"template": {
3+
"aliases": {
4+
"fscrawler": { }
5+
}
6+
}
7+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
"template": {
3+
"mappings": {
4+
"properties": {
5+
"attachment": {
6+
"type": "binary",
7+
"doc_values": false
8+
}
9+
}
10+
}
11+
}
12+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
{
2+
"template": {
3+
"mappings": {
4+
"properties": {
5+
"attributes": {
6+
"properties": {
7+
"group": {
8+
"type": "keyword"
9+
},
10+
"owner": {
11+
"type": "keyword"
12+
}
13+
}
14+
}
15+
}
16+
}
17+
}
18+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"template": {
3+
"mappings": {
4+
"properties": {
5+
"content": {
6+
"type": "text"
7+
}
8+
}
9+
}
10+
}
11+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
{
2+
"template": {
3+
"mappings": {
4+
"properties": {
5+
"content": {
6+
"type": "text",
7+
"copy_to": [ "content_semantic" ]
8+
},
9+
"content_semantic": {
10+
"type": "semantic_text"
11+
}
12+
}
13+
}
14+
}
15+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
"template": {
3+
"mappings": {
4+
"properties": {
5+
"content": {
6+
"type": "text"
7+
},
8+
"content_vector": {
9+
"type": "dense_vector"
10+
}
11+
}
12+
}
13+
}
14+
}

0 commit comments

Comments
 (0)