Skip to content

Commit 312f5c3

Browse files
authored
refactor doc mapper (#5522)
* refactor doc mapper: move tantivy val to json to own module, remove duplicated conversion logic, move field_presence to own module
* use base64 for bytes
1 parent 3254064 commit 312f5c3

File tree

5 files changed

+564
-482
lines changed

5 files changed

+564
-482
lines changed

quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs

Lines changed: 5 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ use std::num::NonZeroU32;
2222

2323
use anyhow::{bail, Context};
2424
use fnv::FnvHashSet;
25-
use quickwit_common::PathHasher;
2625
use quickwit_proto::types::DocMappingUid;
2726
use quickwit_query::create_default_quickwit_tokenizer_manager;
2827
use quickwit_query::query_ast::QueryAst;
@@ -31,13 +30,12 @@ use serde::{Deserialize, Serialize};
3130
use serde_json::{self, Value as JsonValue};
3231
use serde_json_borrow::Map as BorrowedJsonMap;
3332
use tantivy::query::Query;
34-
use tantivy::schema::document::{ReferenceValue, ReferenceValueLeaf};
35-
use tantivy::schema::{
36-
Field, FieldType, OwnedValue as TantivyValue, Schema, Value, INDEXED, STORED,
37-
};
33+
use tantivy::schema::{Field, FieldType, OwnedValue as TantivyValue, Schema, INDEXED, STORED};
3834
use tantivy::TantivyDocument as Document;
3935

4036
use super::field_mapping_entry::RAW_TOKENIZER_NAME;
37+
use super::field_presence::populate_field_presence;
38+
use super::tantivy_val_to_json::tantivy_value_to_json;
4139
use super::DocMapperBuilder;
4240
use crate::doc_mapper::mapping_tree::{
4341
build_field_path_from_str, build_mapping_tree, map_primitive_json_to_tantivy,
@@ -430,85 +428,6 @@ fn extract_single_obj(
430428
}
431429
}
432430

433-
/// Converts an owned tantivy value into a `serde_json` value.
///
/// `Null`, `Str`, `U64`, `I64`, `F64` and `Bool` map to the matching JSON variant, `PreTokStr`
/// keeps only its `text`, `Facet` is rendered with `to_string()`, and arrays and objects are
/// converted recursively. `Date`, `Bytes` and `IpAddr` are rendered through their `Debug`
/// representation (`{:?}`) rather than a dedicated format.
// TODO: Formatting according to mapper if applicable
434-
fn tantivy_value_to_json(val: TantivyValue) -> JsonValue {
435-
match val {
436-
TantivyValue::Null => JsonValue::Null,
437-
TantivyValue::Str(val) => JsonValue::String(val),
438-
// Only the original text is kept; the rest of the pre-tokenized value is dropped.
TantivyValue::PreTokStr(val) => JsonValue::String(val.text),
439-
TantivyValue::U64(val) => JsonValue::Number(val.into()),
440-
TantivyValue::I64(val) => JsonValue::Number(val.into()),
441-
TantivyValue::F64(val) => serde_json::json!(val),
442-
TantivyValue::Bool(val) => JsonValue::Bool(val),
443-
// `Debug` output of the date value, not a mapper-specific date format (see TODO above).
TantivyValue::Date(val) => JsonValue::String(format!("{:?}", val)),
444-
TantivyValue::Facet(val) => JsonValue::String(val.to_string()),
445-
// `Debug` output of the byte buffer.
// NOTE(review): the commit message mentions "use base64 for bytes" — presumably the
// replacement module encodes bytes differently; confirm before relying on this form.
TantivyValue::Bytes(val) => JsonValue::String(format!("{:?}", val)),
446-
// Arrays and objects recurse element by element / entry by entry.
TantivyValue::Array(val) => val.into_iter().map(tantivy_value_to_json).collect(),
447-
TantivyValue::Object(val) => val
448-
.into_iter()
449-
.map(|(key, val)| (key, tantivy_value_to_json(val)))
450-
.collect(),
451-
// `Debug` output of the IP address value.
TantivyValue::IpAddr(val) => JsonValue::String(format!("{:?}", val)),
452-
}
453-
}
454-
455-
/// Records field-presence hashes for one JSON value located at `path_hasher`'s current path.
///
/// - `json_value`: the value to inspect, viewed through `as_value()`.
/// - `path_hasher`: hasher holding the path of `json_value`; it is only read here.
/// - `is_expand_dots_enabled`: forwarded unchanged to nested objects.
/// - `output`: set receiving the hashes.
///
/// A null leaf records nothing, any other leaf inserts `path_hasher.finish()`, arrays recurse
/// into each item with the same hasher, and objects are delegated to
/// `populate_field_presence_for_json_obj`.
#[inline]
456-
fn populate_field_presence_for_json_value<'a>(
457-
json_value: impl Value<'a>,
458-
path_hasher: &PathHasher,
459-
is_expand_dots_enabled: bool,
460-
output: &mut FnvHashSet<u64>,
461-
) {
462-
match json_value.as_value() {
463-
// A null leaf does not mark the path as present.
ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {}
464-
ReferenceValue::Leaf(_) => {
465-
// presumably `finish()` yields the hash of everything appended so far — confirm
// against `quickwit_common::PathHasher`.
output.insert(path_hasher.finish());
466-
}
467-
ReferenceValue::Array(items) => {
468-
// Items reuse the array's own hasher: nothing is appended per element, so every
// item is recorded under the same path as the array.
for item in items {
469-
populate_field_presence_for_json_value(
470-
item,
471-
path_hasher,
472-
is_expand_dots_enabled,
473-
output,
474-
);
475-
}
476-
}
477-
ReferenceValue::Object(json_obj) => {
478-
// The callee takes the hasher by value, hence the clone of the borrowed one.
populate_field_presence_for_json_obj(
479-
json_obj,
480-
path_hasher.clone(),
481-
is_expand_dots_enabled,
482-
output,
483-
);
484-
}
485-
}
486-
}
487-
488-
/// Records field-presence hashes for every entry of a JSON object.
///
/// - `json_obj`: iterator over the object's `(key, value)` pairs.
/// - `path_hasher`: hasher holding the path of the object itself; each entry works on its own
///   clone, so sibling keys do not affect one another.
/// - `is_expand_dots_enabled`: when `true`, a key is split on `'.'` and each segment is appended
///   separately; when `false`, the whole key is appended as a single segment.
/// - `output`: set receiving the hashes.
fn populate_field_presence_for_json_obj<'a, Iter: Iterator<Item = (&'a str, impl Value<'a>)>>(
489-
json_obj: Iter,
490-
path_hasher: PathHasher,
491-
is_expand_dots_enabled: bool,
492-
output: &mut FnvHashSet<u64>,
493-
) {
494-
for (field_key, field_value) in json_obj {
495-
// Start every entry from the parent's path.
let mut child_path_hasher = path_hasher.clone();
496-
if is_expand_dots_enabled {
497-
// Each dot-separated segment is appended on its own, the same sequence of appends
// that a nested object with those keys would produce.
for segment in field_key.split('.') {
498-
child_path_hasher.append(segment.as_bytes());
499-
}
500-
} else {
501-
child_path_hasher.append(field_key.as_bytes());
502-
};
503-
populate_field_presence_for_json_value(
504-
field_value,
505-
&child_path_hasher,
506-
is_expand_dots_enabled,
507-
output,
508-
);
509-
}
510-
}
511-
512431
impl DocMapper {
513432
/// Returns the unique identifier of the doc mapping.
514433
pub fn doc_mapping_uid(&self) -> DocMappingUid {
@@ -636,36 +555,9 @@ impl DocMapper {
636555
document.add_u64(document_size_field, document_len);
637556
}
638557

639-
// The capacity is inexact here.
640-
641558
if self.index_field_presence {
642-
let mut field_presence_hashes: FnvHashSet<u64> =
643-
FnvHashSet::with_capacity_and_hasher(document.len(), Default::default());
644-
for (field, value) in document.field_values() {
645-
let field_entry = self.schema.get_field_entry(field);
646-
if !field_entry.is_indexed() || field_entry.is_fast() {
647-
// We are using an tantivy's ExistsQuery for fast fields.
648-
continue;
649-
}
650-
let mut path_hasher: PathHasher = PathHasher::default();
651-
path_hasher.append(&field.field_id().to_le_bytes()[..]);
652-
if let Some(json_obj) = value.as_object() {
653-
let is_expand_dots_enabled: bool =
654-
if let FieldType::JsonObject(json_options) = field_entry.field_type() {
655-
json_options.is_expand_dots_enabled()
656-
} else {
657-
false
658-
};
659-
populate_field_presence_for_json_obj(
660-
json_obj,
661-
path_hasher,
662-
is_expand_dots_enabled,
663-
&mut field_presence_hashes,
664-
);
665-
} else {
666-
field_presence_hashes.insert(path_hasher.finish());
667-
}
668-
}
559+
let field_presence_hashes: FnvHashSet<u64> =
560+
populate_field_presence(&document, &self.schema);
669561
for field_presence_hash in field_presence_hashes {
670562
document.add_field_value(FIELD_PRESENCE_FIELD, &field_presence_hash);
671563
}
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// Copyright (C) 2024 Quickwit, Inc.
2+
//
3+
// Quickwit is offered under the AGPL v3.0 and as commercial software.
4+
// For commercial licensing, contact us at hello@quickwit.io.
5+
//
6+
// AGPL:
7+
// This program is free software: you can redistribute it and/or modify
8+
// it under the terms of the GNU Affero General Public License as
9+
// published by the Free Software Foundation, either version 3 of the
10+
// License, or (at your option) any later version.
11+
//
12+
// This program is distributed in the hope that it will be useful,
13+
// but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
// GNU Affero General Public License for more details.
16+
//
17+
// You should have received a copy of the GNU Affero General Public License
18+
// along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
20+
use fnv::FnvHashSet;
21+
use quickwit_common::PathHasher;
22+
use tantivy::schema::document::{ReferenceValue, ReferenceValueLeaf};
23+
use tantivy::schema::{FieldType, Schema, Value};
24+
use tantivy::Document;
25+
26+
/// Populates the field presence for a document.
27+
///
28+
/// The field presence is a set of hashes that represent the fields that are present in the
29+
/// document. Each hash is computed from the field path.
30+
///
31+
/// A hash is only added if the field is indexed and not fast.
///
/// - `document`: the document whose `(field, value)` pairs are scanned.
/// - `schema`: used to look up each field's entry (indexed / fast flags and field type).
///
/// Returns the set of hashes. Every path starts with the field id in little-endian bytes. A
/// value exposing an object view contributes one hash per non-null leaf of that object; any
/// other value contributes the hash of the field id alone.
32+
pub(crate) fn populate_field_presence<D: Document>(
33+
document: &D,
34+
schema: &Schema,
35+
) -> FnvHashSet<u64> {
36+
// The set is pre-sized with the number of fields in the schema.
let mut field_presence_hashes: FnvHashSet<u64> =
37+
FnvHashSet::with_capacity_and_hasher(schema.num_fields(), Default::default());
38+
for (field, value) in document.iter_fields_and_values() {
39+
let field_entry = schema.get_field_entry(field);
40+
if !field_entry.is_indexed() || field_entry.is_fast() {
41+
// We are using tantivy's ExistsQuery for fast fields.
42+
continue;
43+
}
44+
// Seed the path with the field id so that equal keys under different fields differ.
let mut path_hasher: PathHasher = PathHasher::default();
45+
path_hasher.append(&field.field_id().to_le_bytes()[..]);
46+
if let Some(json_obj) = value.as_object() {
47+
// Dot expansion is read from the JSON field options; any other field type that
// still yields an object view is treated as not expanding dots.
let is_expand_dots_enabled: bool =
48+
if let FieldType::JsonObject(json_options) = field_entry.field_type() {
49+
json_options.is_expand_dots_enabled()
50+
} else {
51+
false
52+
};
53+
populate_field_presence_for_json_obj(
54+
json_obj,
55+
path_hasher,
56+
is_expand_dots_enabled,
57+
&mut field_presence_hashes,
58+
);
59+
} else {
60+
// Non-object value: the field itself is recorded as present.
field_presence_hashes.insert(path_hasher.finish());
61+
}
62+
}
63+
field_presence_hashes
64+
}
65+
66+
/// Records field-presence hashes for one JSON value located at `path_hasher`'s current path.
///
/// - `json_value`: the value to inspect, viewed through `as_value()`.
/// - `path_hasher`: hasher holding the path of `json_value`; it is only read here.
/// - `is_expand_dots_enabled`: forwarded unchanged to nested objects.
/// - `output`: set receiving the hashes.
///
/// A null leaf records nothing, any other leaf inserts `path_hasher.finish()`, arrays recurse
/// into each item with the same hasher, and objects are delegated to
/// `populate_field_presence_for_json_obj`.
#[inline]
67+
fn populate_field_presence_for_json_value<'a>(
68+
json_value: impl Value<'a>,
69+
path_hasher: &PathHasher,
70+
is_expand_dots_enabled: bool,
71+
output: &mut FnvHashSet<u64>,
72+
) {
73+
match json_value.as_value() {
74+
// A null leaf does not mark the path as present.
ReferenceValue::Leaf(ReferenceValueLeaf::Null) => {}
75+
ReferenceValue::Leaf(_) => {
76+
// presumably `finish()` yields the hash of everything appended so far — confirm
// against `quickwit_common::PathHasher`.
output.insert(path_hasher.finish());
77+
}
78+
ReferenceValue::Array(items) => {
79+
// Items reuse the array's own hasher: nothing is appended per element, so every
// item is recorded under the same path as the array.
for item in items {
80+
populate_field_presence_for_json_value(
81+
item,
82+
path_hasher,
83+
is_expand_dots_enabled,
84+
output,
85+
);
86+
}
87+
}
88+
ReferenceValue::Object(json_obj) => {
89+
// The callee takes the hasher by value, hence the clone of the borrowed one.
populate_field_presence_for_json_obj(
90+
json_obj,
91+
path_hasher.clone(),
92+
is_expand_dots_enabled,
93+
output,
94+
);
95+
}
96+
}
97+
}
98+
99+
/// Records field-presence hashes for every entry of a JSON object.
///
/// - `json_obj`: iterator over the object's `(key, value)` pairs.
/// - `path_hasher`: hasher holding the path of the object itself; each entry works on its own
///   clone, so sibling keys do not affect one another.
/// - `is_expand_dots_enabled`: when `true`, a key is split on `'.'` and each segment is appended
///   separately; when `false`, the whole key is appended as a single segment.
/// - `output`: set receiving the hashes.
fn populate_field_presence_for_json_obj<'a, Iter: Iterator<Item = (&'a str, impl Value<'a>)>>(
100+
json_obj: Iter,
101+
path_hasher: PathHasher,
102+
is_expand_dots_enabled: bool,
103+
output: &mut FnvHashSet<u64>,
104+
) {
105+
for (field_key, field_value) in json_obj {
106+
// Start every entry from the parent's path.
let mut child_path_hasher = path_hasher.clone();
107+
if is_expand_dots_enabled {
108+
// Each dot-separated segment is appended on its own, the same sequence of appends
// that a nested object with those keys would produce.
for segment in field_key.split('.') {
109+
child_path_hasher.append(segment.as_bytes());
110+
}
111+
} else {
112+
child_path_hasher.append(field_key.as_bytes());
113+
};
114+
populate_field_presence_for_json_value(
115+
field_value,
116+
&child_path_hasher,
117+
is_expand_dots_enabled,
118+
output,
119+
);
120+
}
121+
}

0 commit comments

Comments
 (0)