migrate(smart-autocomplete): Add a hashmap optimization + BF index to smart autocomplete table (#6875)

Using a trick similar to the tags hashmap optimization, store a hash of
every key as an array column so we can check for the existence of an
attribute much faster (no string comparisons needed), and add a bloom
filter index on that column.


With this trick we can add the following condition to a PREWHERE clause
to speed up the query:

```
has(_str_attr_keys_hashes, cityHash64('sentry.op'))
```
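
For illustration, a query builder might construct that condition from an attribute name as in the sketch below (the helper name and escaping are hypothetical, not part of this commit):

```
def attr_exists_prewhere(attr_name: str) -> str:
    # cityHash64 is evaluated by ClickHouse at query time, so the attribute
    # name is spliced into the expression as a string literal; the hashed-keys
    # column itself is only ever compared against UInt64 values.
    escaped = attr_name.replace("\\", "\\\\").replace("'", "\\'")
    return f"has(_str_attr_keys_hashes, cityHash64('{escaped}'))"

# attr_exists_prewhere("sentry.op")
# -> "has(_str_attr_keys_hashes, cityHash64('sentry.op'))"
```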
volokluev authored Feb 12, 2025
1 parent 007836d commit 39530ad
Showing 2 changed files with 147 additions and 0 deletions.
4 changes: 4 additions & 0 deletions snuba/datasets/storages/tags_hash_map.py
@@ -33,6 +33,10 @@
)


def get_array_vals_hash(col_name: str) -> str:
    # e.g. "mapKeys(attrs_string)" -> "arrayMap(k -> cityHash64(k), mapKeys(attrs_string))"
    return f"arrayMap(k -> cityHash64(k), {col_name})"


def hash_map_int_column_definition(key_column_name: str, value_column_name: str) -> str:
return (
f"arrayMap((k, v) -> cityHash64(concat(toString(k), '=', toString(v))), "
143 changes: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
from typing import Sequence

from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.storages.tags_hash_map import get_array_vals_hash
from snuba.migrations import migration, operations
from snuba.migrations.columns import MigrationModifiers as Modifiers
from snuba.utils.schemas import Array, Column, UInt


class Migration(migration.ClickhouseNodeMigration):

blocking = False
storage_set_key = StorageSetKey.EVENTS_ANALYTICS_PLATFORM
granularity = "8192"

local_table_name = "eap_trace_item_attrs_local"
dist_table_name = "eap_trace_item_attrs_dist"
mv_name = "eap_trace_item_attrs_mv"

str_hash_map_col = "_str_attr_keys_hashes"
float_hash_map_col = "_float64_attr_keys_hashes"

def forwards_ops(self) -> Sequence[operations.SqlOperation]:
return [
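            # Each attrs map gets a materialized hashed-keys column on both the
            # local and dist tables, plus a bloom filter skip index on the
            # local table, where the data (and hence the index) lives.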
# --- Str attrs -----
operations.AddColumn(
storage_set=self.storage_set_key,
table_name=self.local_table_name,
column=Column(
name=self.str_hash_map_col,
type=Array(
UInt(64),
Modifiers(
materialized=get_array_vals_hash("mapKeys(attrs_string)")
),
),
),
after="attrs_string",
target=operations.OperationTarget.LOCAL,
),
operations.AddColumn(
storage_set=self.storage_set_key,
table_name=self.dist_table_name,
column=Column(
name=self.str_hash_map_col,
type=Array(
UInt(64),
Modifiers(
materialized=get_array_vals_hash("mapKeys(attrs_string)")
),
),
),
after="attrs_string",
target=operations.OperationTarget.DISTRIBUTED,
),
operations.AddIndex(
storage_set=self.storage_set_key,
table_name=self.local_table_name,
index_name=f"bf_{self.str_hash_map_col}",
index_expression=self.str_hash_map_col,
index_type="bloom_filter",
granularity=1,
target=operations.OperationTarget.LOCAL,
),
# --- Num attrs -----
operations.AddColumn(
storage_set=self.storage_set_key,
table_name=self.local_table_name,
column=Column(
name=self.float_hash_map_col,
type=Array(
UInt(64),
Modifiers(materialized=get_array_vals_hash("attrs_float64")),
),
),
after="attrs_float64",
target=operations.OperationTarget.LOCAL,
),
operations.AddColumn(
storage_set=self.storage_set_key,
table_name=self.dist_table_name,
column=Column(
name=self.float_hash_map_col,
type=Array(
UInt(64),
Modifiers(materialized=get_array_vals_hash("attrs_float64")),
),
),
after="attrs_float64",
target=operations.OperationTarget.DISTRIBUTED,
),
operations.AddIndex(
storage_set=self.storage_set_key,
table_name=self.local_table_name,
index_name=f"bf_{self.float_hash_map_col}",
index_expression=self.float_hash_map_col,
index_type="bloom_filter",
granularity=1,
target=operations.OperationTarget.LOCAL,
),
]
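
    # For reference, a rough sketch of the DDL the local-table string ops
    # above amount to (illustrative only; the exact SQL is generated by
    # snuba's operations layer):
    #
    #   ALTER TABLE eap_trace_item_attrs_local
    #       ADD COLUMN _str_attr_keys_hashes Array(UInt64)
    #       MATERIALIZED arrayMap(k -> cityHash64(k), mapKeys(attrs_string))
    #       AFTER attrs_string
    #
    #   ALTER TABLE eap_trace_item_attrs_local
    #       ADD INDEX bf__str_attr_keys_hashes _str_attr_keys_hashes
    #       TYPE bloom_filter GRANULARITY 1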

def backwards_ops(self) -> Sequence[operations.SqlOperation]:
return [
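            # Tear down the dist column first, then the local index, then the
            # local column the index was built on.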
# --- Str attrs -----
operations.DropColumn(
storage_set=self.storage_set_key,
table_name=self.dist_table_name,
column_name=self.str_hash_map_col,
target=operations.OperationTarget.DISTRIBUTED,
),
operations.DropIndex(
storage_set=self.storage_set_key,
table_name=self.local_table_name,
index_name=f"bf_{self.str_hash_map_col}",
target=operations.OperationTarget.LOCAL,
),
operations.DropColumn(
storage_set=self.storage_set_key,
table_name=self.local_table_name,
column_name=self.str_hash_map_col,
target=operations.OperationTarget.LOCAL,
),
# --- Num attrs -----
operations.DropColumn(
storage_set=self.storage_set_key,
table_name=self.dist_table_name,
column_name=self.float_hash_map_col,
target=operations.OperationTarget.DISTRIBUTED,
),
operations.DropIndex(
storage_set=self.storage_set_key,
table_name=self.local_table_name,
index_name=f"bf_{self.float_hash_map_col}",
target=operations.OperationTarget.LOCAL,
),
operations.DropColumn(
storage_set=self.storage_set_key,
table_name=self.local_table_name,
column_name=self.float_hash_map_col,
target=operations.OperationTarget.LOCAL,
),
]
