Skip to content

Commit cb58488

Browse files
committed
Deprecate CamelCase PathHierarchy tokenizer name
Deprecate CamelCase PathHierarchy tokenizer name in favor of lowercase path_hierarchy. Signed-off-by: Lukáš Vlček <[email protected]>
1 parent be65f54 commit cb58488

File tree

4 files changed

+76
-4
lines changed

4 files changed

+76
-4
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6262
- Return 409 Conflict HTTP status instead of 503 on failure to concurrently execute snapshots ([#8986](https://github.com/opensearch-project/OpenSearch/pull/5855))
6363
- Add task completion count in search backpressure stats API ([#10028](https://github.com/opensearch-project/OpenSearch/pull/10028/))
6464
- Performance improvement for Datetime field caching ([#4558](https://github.com/opensearch-project/OpenSearch/issues/4558))
65+
- Deprecate CamelCase `PathHierarchy` tokenizer name in favor of lowercase `path_hierarchy` ([#10894](https://github.com/opensearch-project/OpenSearch/pull/10894))
6566

6667

6768
### Deprecated

modules/analysis-common/src/main/java/org/opensearch/analysis/common/CommonAnalysisModulePlugin.java

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,17 @@ public Map<String, AnalysisProvider<TokenizerFactory>> getTokenizers() {
394394
// TODO deprecate and remove in API
395395
tokenizers.put("lowercase", XLowerCaseTokenizerFactory::new);
396396
tokenizers.put("path_hierarchy", PathHierarchyTokenizerFactory::new);
397-
tokenizers.put("PathHierarchy", PathHierarchyTokenizerFactory::new);
397+
tokenizers.put("PathHierarchy", (IndexSettings indexSettings, Environment environment, String name, Settings settings) -> {
398+
// TODO Remove "PathHierarchy" tokenizer name in 4.0 and throw exception
399+
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_3_0_0)) {
400+
deprecationLogger.deprecate(
401+
"PathHierarchy_tokenizer_deprecation",
402+
"The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. "
403+
+ "Please change the tokenizer name to [path_hierarchy] instead."
404+
);
405+
}
406+
return new PathHierarchyTokenizerFactory(indexSettings, environment, name, settings);
407+
});
398408
tokenizers.put("pattern", PatternTokenizerFactory::new);
399409
tokenizers.put("uax_url_email", UAX29URLEmailTokenizerFactory::new);
400410
tokenizers.put("whitespace", WhitespaceTokenizerFactory::new);
@@ -662,8 +672,17 @@ public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
662672
}
663673
return new EdgeNGramTokenizer(NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE, NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
664674
}));
665-
tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new));
666-
675+
tokenizers.add(PreConfiguredTokenizer.openSearchVersion("PathHierarchy", (version) -> {
676+
// TODO Remove "PathHierarchy" tokenizer name in 4.0 and throw exception
677+
if (version.onOrAfter(Version.V_3_0_0)) {
678+
deprecationLogger.deprecate(
679+
"PathHierarchy_tokenizer_deprecation",
680+
"The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. "
681+
+ "Please change the tokenizer name to [path_hierarchy] instead."
682+
);
683+
}
684+
return new PathHierarchyTokenizer();
685+
}));
667686
return tokenizers;
668687
}
669688
}

modules/analysis-common/src/test/java/org/opensearch/analysis/common/PathHierarchyTokenizerFactoryTests.java

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,61 @@
3535
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
3636

3737
import org.apache.lucene.analysis.Tokenizer;
38+
import org.opensearch.Version;
39+
import org.opensearch.cluster.metadata.IndexMetadata;
3840
import org.opensearch.common.settings.Settings;
3941
import org.opensearch.core.index.Index;
42+
import org.opensearch.env.Environment;
43+
import org.opensearch.env.TestEnvironment;
44+
import org.opensearch.index.IndexSettings;
45+
import org.opensearch.index.analysis.IndexAnalyzers;
46+
import org.opensearch.index.analysis.NamedAnalyzer;
47+
import org.opensearch.indices.analysis.AnalysisModule;
4048
import org.opensearch.test.IndexSettingsModule;
4149
import org.opensearch.test.OpenSearchTokenStreamTestCase;
50+
import org.opensearch.test.VersionUtils;
4251

4352
import java.io.IOException;
4453
import java.io.StringReader;
54+
import java.util.Collections;
4555

4656
public class PathHierarchyTokenizerFactoryTests extends OpenSearchTokenStreamTestCase {
4757

58+
private IndexAnalyzers buildAnalyzers(Version version, String tokenizer) throws IOException {
59+
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
60+
Settings indexSettings = Settings.builder()
61+
.put(IndexMetadata.SETTING_VERSION_CREATED, version)
62+
.put("index.analysis.analyzer.my_analyzer.tokenizer", tokenizer)
63+
.build();
64+
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings);
65+
return new AnalysisModule(TestEnvironment.newEnvironment(settings), Collections.singletonList(new CommonAnalysisModulePlugin()))
66+
.getAnalysisRegistry()
67+
.build(idxSettings);
68+
}
69+
70+
/**
71+
* Tests that the deprecated "PathHierarchy" tokenizer name is still available via {@link CommonAnalysisModulePlugin} for indices created on or after 3.0.0.
72+
*/
73+
public void testPreConfiguredTokenizer() throws IOException {
74+
75+
{
76+
try (
77+
IndexAnalyzers indexAnalyzers = buildAnalyzers(
78+
VersionUtils.randomVersionBetween(random(), Version.V_3_0_0, Version.CURRENT),
79+
"PathHierarchy"
80+
)
81+
) {
82+
NamedAnalyzer analyzer = indexAnalyzers.get("my_analyzer");
83+
assertNotNull(analyzer);
84+
assertTokenStreamContents(analyzer.tokenStream("dummy", "/a/b/c"), new String[] { "/a", "/a/b", "/a/b/c" });
85+
// Once LUCENE-12750 is fixed we can use the following testing method instead.
86+
// A similar testing approach has been used for the deprecation of the (Edge)NGram tokenizers as well.
87+
// assertAnalyzesTo(analyzer, "/a/b/c", new String[] { "/a", "/a/b", "/a/b/c" });
88+
89+
}
90+
}
91+
}
92+
4893
public void testDefaults() throws IOException {
4994
final Index index = new Index("test", "_na_");
5095
final Settings indexSettings = newAnalysisSettingsBuilder().build();

modules/analysis-common/src/yamlRestTest/resources/rest-api-spec/test/analysis-common/30_tokenizers.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,9 @@
298298

299299
---
300300
"path_hierarchy":
301+
- skip:
302+
features: "allowed_warnings"
303+
301304
- do:
302305
indices.analyze:
303306
body:
@@ -312,6 +315,8 @@
312315
- match: { detail.tokenizer.tokens.2.token: a/b/c }
313316

314317
- do:
318+
allowed_warnings:
319+
- 'The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. Please change the tokenizer name to [path_hierarchy] instead.'
315320
indices.analyze:
316321
body:
317322
text: "a/b/c"
@@ -337,11 +342,13 @@
337342
- match: { detail.tokenizer.tokens.2.token: a/b/c }
338343

339344
- do:
345+
allowed_warnings:
346+
- 'The [PathHierarchy] tokenizer name is deprecated and will be removed in a future version. Please change the tokenizer name to [path_hierarchy] instead.'
340347
indices.analyze:
341348
body:
342349
text: "a/b/c"
343350
explain: true
344-
tokenizer: PathHierarchy
351+
tokenizer: PathHierarchy
345352
- length: { detail.tokenizer.tokens: 3 }
346353
- match: { detail.tokenizer.name: PathHierarchy }
347354
- match: { detail.tokenizer.tokens.0.token: a }

0 commit comments

Comments
 (0)