Skip to content

Commit 722438d

Browse files
committed
MinHashAnalyzer
1 parent 96d8b71 commit 722438d

File tree

5 files changed

+161
-1
lines changed

5 files changed

+161
-1
lines changed

src/main/java/com/arangodb/entity/arangosearch/AnalyzerType.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,5 +38,6 @@ public enum AnalyzerType {
3838
segmentation,
3939
collation,
4040
classification,
41-
nearest_neighbors
41+
nearest_neighbors,
42+
minhash
4243
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
/*
2+
* DISCLAIMER
3+
*
4+
* Copyright 2016 ArangoDB GmbH, Cologne, Germany
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the "License");
7+
* you may not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*
18+
* Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
*/
20+
21+
package com.arangodb.entity.arangosearch.analyzer;
22+
23+
24+
import com.arangodb.entity.arangosearch.AnalyzerType;
25+
26+
import java.util.Objects;
27+
28+
/**
29+
* An Analyzer that computes so-called MinHash signatures using a locality-sensitive hash function. It applies an
30+
* Analyzer of your choice before the hashing, for example, to break up text into words.
31+
*
32+
* @author Michele Rastelli
33+
* @see <a href= "https://www.arangodb.com/docs/stable/analyzers.html#minhash">API Documentation</a>
34+
* @since ArangoDB 3.10
35+
*/
36+
public class MinHashAnalyzer extends SearchAnalyzer {
37+
// Tags every new instance with the minhash analyzer type via the inherited setter.
public MinHashAnalyzer() {
38+
setType(AnalyzerType.minhash);
39+
}
40+
41+
// Type-specific configuration; not initialized here, so it stays null until setProperties is called.
private MinHashAnalyzerProperties properties;
42+
43+
// Returns the configured properties, or null if none were set.
public MinHashAnalyzerProperties getProperties() {
44+
return properties;
45+
}
46+
47+
// Replaces the configured properties; no validation is performed.
public void setProperties(MinHashAnalyzerProperties properties) {
48+
this.properties = properties;
49+
}
50+
51+
// Equal only to an instance of the exact same class whose superclass state and properties both match.
@Override
52+
public boolean equals(Object o) {
53+
if (this == o) return true;
54+
if (o == null || getClass() != o.getClass()) return false;
55+
// Delegate comparison of the inherited state to SearchAnalyzer before looking at our own field.
if (!super.equals(o)) return false;
56+
MinHashAnalyzer that = (MinHashAnalyzer) o;
57+
// Objects.equals is null-safe, so two analyzers without properties compare equal here.
return Objects.equals(properties, that.properties);
58+
}
59+
60+
// Combines the superclass hash with the properties, mirroring the fields compared in equals.
@Override
61+
public int hashCode() {
62+
return Objects.hash(super.hashCode(), properties);
63+
}
64+
}
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*
2+
* DISCLAIMER
3+
*
4+
* Copyright 2016 ArangoDB GmbH, Cologne, Germany
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the "License");
7+
* you may not use this file except in compliance with the License.
8+
* You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*
18+
* Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
*/
20+
21+
package com.arangodb.entity.arangosearch.analyzer;
22+
23+
24+
import java.util.Objects;
25+
26+
/**
27+
* @author Michele Rastelli
28+
* @since ArangoDB 3.10
29+
*/
30+
// Mutable holder for the two settings of a MinHashAnalyzer: a nested analyzer and a hash count.
public class MinHashAnalyzerProperties {
31+
32+
// Nested analyzer; presumably the one applied to the input before hashing - confirm against the ArangoDB docs.
private SearchAnalyzer analyzer;
33+
// NOTE(review): presumably the size of the MinHash signature; boxed, so null means "not set" - confirm.
private Integer numHashes;
34+
35+
// Returns the nested analyzer, or null if none was set.
public SearchAnalyzer getAnalyzer() {
36+
return analyzer;
37+
}
38+
39+
// Replaces the nested analyzer; no validation is performed.
public void setAnalyzer(SearchAnalyzer analyzer) {
40+
this.analyzer = analyzer;
41+
}
42+
43+
// Returns the configured hash count, or null if none was set.
public Integer getNumHashes() {
44+
return numHashes;
45+
}
46+
47+
// Replaces the configured hash count; no range check is performed.
public void setNumHashes(Integer numHashes) {
48+
this.numHashes = numHashes;
49+
}
50+
51+
// Equal only to an instance of the exact same class with an equal analyzer and an equal numHashes.
@Override
52+
public boolean equals(Object o) {
53+
if (this == o) return true;
54+
if (o == null || getClass() != o.getClass()) return false;
55+
MinHashAnalyzerProperties that = (MinHashAnalyzerProperties) o;
56+
// Both comparisons are null-safe, so unset fields on both sides count as equal.
return Objects.equals(analyzer, that.analyzer) && Objects.equals(numHashes, that.numHashes);
57+
}
58+
59+
// Hashes exactly the two fields that equals compares.
@Override
60+
public int hashCode() {
61+
return Objects.hash(analyzer, numHashes);
62+
}
63+
}

src/main/java/com/arangodb/internal/velocypack/VPackDeserializers.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ public class VPackDeserializers {
106106
return context.deserialize(vpack, ClassificationAnalyzer.class);
107107
case nearest_neighbors:
108108
return context.deserialize(vpack, NearestNeighborsAnalyzer.class);
109+
case minhash:
110+
return context.deserialize(vpack, MinHashAnalyzer.class);
109111
default:
110112
throw new IllegalArgumentException("Unknown analyzer type: " + type);
111113
}

src/test/java/com/arangodb/ArangoSearchTest.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,6 +1034,36 @@ void nearestNeighborsAnalyzer(ArangoDatabase db) {
10341034
createGetAndDeleteTypedAnalyzer(db, analyzer);
10351035
}
10361036

1037+
// Builds a MinHashAnalyzer wrapping a segmentation analyzer and hands it to createGetAndDeleteTypedAnalyzer.
// NOTE(review): the method name starts with an upper-case letter, unlike the neighbouring tests
// (nearestNeighborsAnalyzer, offsetFeature); consider renaming to minHashAnalyzer.
@ParameterizedTest(name = "{index}")
1038+
@MethodSource("dbs")
1039+
void MinHashAnalyzer(ArangoDatabase db) {
1040+
// The test is skipped unless the version check for 3.10 and the enterprise check both pass.
assumeTrue(isAtLeastVersion(3, 10));
1041+
assumeTrue(isEnterprise());
1042+
1043+
// Configuration of the nested analyzer: alpha break mode, lower-case conversion.
SegmentationAnalyzerProperties segProperties = new SegmentationAnalyzerProperties();
1044+
segProperties.setBreakMode(SegmentationAnalyzerProperties.BreakMode.alpha);
1045+
segProperties.setAnalyzerCase(SearchAnalyzerCase.lower);
1046+
1047+
SegmentationAnalyzer segAnalyzer = new SegmentationAnalyzer();
1048+
segAnalyzer.setProperties(segProperties);
1049+
1050+
// MinHash settings: the segmentation analyzer above as nested analyzer and a hash count of 2.
MinHashAnalyzerProperties properties = new MinHashAnalyzerProperties();
1051+
properties.setAnalyzer(segAnalyzer);
1052+
properties.setNumHashes(2);
1053+
1054+
Set<AnalyzerFeature> features = new HashSet<>();
1055+
features.add(AnalyzerFeature.frequency);
1056+
features.add(AnalyzerFeature.norm);
1057+
features.add(AnalyzerFeature.position);
1058+
1059+
MinHashAnalyzer analyzer = new MinHashAnalyzer();
1060+
// A random UUID suffix gives each run its own analyzer name.
analyzer.setName("test-" + UUID.randomUUID());
1061+
analyzer.setProperties(properties);
1062+
analyzer.setFeatures(features);
1063+
1064+
// Helper defined elsewhere in this test class; judging by its name it creates, reads back and deletes the analyzer - confirm.
createGetAndDeleteTypedAnalyzer(db, analyzer);
1065+
}
1066+
10371067
@ParameterizedTest(name = "{index}")
10381068
@MethodSource("dbs")
10391069
void offsetFeature(ArangoDatabase db) {

0 commit comments

Comments
 (0)