feat(seer grouping): Call Seer before creating a new group #71026

Merged: 5 commits, May 17, 2024
135 changes: 100 additions & 35 deletions src/sentry/event_manager.py
@@ -68,6 +68,7 @@
record_hash_calculation_metrics,
record_new_group_metrics,
)
from sentry.grouping.ingest.seer import get_seer_similar_issues, should_call_seer_for_grouping
from sentry.grouping.ingest.utils import (
add_group_id_to_grouphashes,
check_for_category_mismatch,
@@ -136,6 +137,7 @@
from sentry.utils.safe import get_path, safe_execute, setdefault_path, trim
from sentry.utils.sdk import set_measurement
from sentry.utils.tag_normalization import normalized_sdk_tag_from_event
from sentry.utils.types import NonNone

if TYPE_CHECKING:
from sentry.eventstore.models import BaseEvent, Event
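
For context on the `NonNone` import above: it's used further down to convince the type checker that `seer_matched_group` can't be `None` on the regression path. A minimal sketch of what such a helper typically looks like — the actual implementation in `sentry.utils.types` may differ:

from typing import TypeVar

T = TypeVar("T")

def NonNone(value: T | None) -> T:
    # Narrow `T | None` to `T`; fail loudly if the value is unexpectedly None
    assert value is not None
    return value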
@@ -1436,6 +1438,15 @@ def _save_aggregate(
):
# These values will get overridden with whatever happens inside the lock if we do manage
# to acquire it, so it should only end up with `wait-for-lock` if we don't
#
# TODO: If we're using this `outcome` value for anything more than a count in DD (in
# other words, if we care about duration), we should probably update it so that when an
# event does have to wait, we record whether during its wait the event which got the
# lock first
# a) created a new group without consulting Seer,
# b) created a new group because Seer didn't find a close enough match, or
# c) used an existing group found by Seer
# because which of those things happened will have an effect on how long the event had to wait.
span.set_tag("outcome", "wait_for_lock")
metric_tags["outcome"] = "wait_for_lock"

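One way the TODO above could eventually shake out, purely as a sketch — the tag values and the `lock_holder_outcome` input are hypothetical, and determining what the lock-holder actually did is the hard part the TODO alludes to:

# Hypothetical finer-grained wait outcomes, one per case listed in the TODO above
WAIT_OUTCOME_TAGS = {
    "new_group_no_seer": "wait_for_lock.new_group_no_seer",              # case a)
    "new_group_seer_no_match": "wait_for_lock.new_group_seer_no_match",  # case b)
    "existing_group_seer_match": "wait_for_lock.seer_match",             # case c)
}

def wait_outcome_tag(lock_holder_outcome: str | None) -> str:
    # Fall back to the current coarse tag when we can't tell what the lock-holder did
    return WAIT_OUTCOME_TAGS.get(lock_holder_outcome or "", "wait_for_lock")
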
@@ -1444,25 +1455,29 @@
all_grouphash_ids.append(root_hierarchical_grouphash.id)

# If we're in this branch, we checked our grouphashes and didn't find one with a group
# attached. We thus want to create a new group, but we need to guard against another
# event with the same hash coming in before we're done here and also thinking it needs
# to create a new group. To prevent this, we're using double-checked locking
# attached. We thus want to either ask seer for a nearest neighbor group (and create a
# new group if one isn't found) or just create a new group without consulting seer, but
# either way we need to guard against another event with the same hash coming in before
# we're done here and also thinking it needs to talk to seer and/or create a new group.
# To prevent this, we're using double-checked locking
# (https://en.wikipedia.org/wiki/Double-checked_locking).

# First, try to lock the relevant rows in the `GroupHash` table. If another (identically
# hashed) event is also in the process of creating a group and has grabbed the lock
# before us, we'll block here until it's done. If not, we've now got the lock and other
# identically-hashed events will have to wait for us.
# hashed) event is already in the process of talking to seer and/or creating a group and
# has grabbed the lock before us, we'll block here until it's done. If not, we've now
# got the lock and other identically-hashed events will have to wait for us.
all_grouphashes = list(
GroupHash.objects.filter(id__in=all_grouphash_ids).select_for_update()
)

flat_grouphashes = [gh for gh in all_grouphashes if gh.hash in hashes.hashes]

# Now check again to see if any of our grouphashes have a group. In the first race
# condition scenario above, we'll have been blocked long enough for the other event to
# have created the group and updated our grouphashes with a group id, which means this
# time, we'll find something.
# Now check again to see if any of our grouphashes have a group. If we got the lock, the
# result won't have changed and we still won't find anything. If we didn't get it, we'll
# have blocked until whichever identically-hashed event *did* get the lock has either
created a new group for our hashes or assigned them to a neighboring group suggested
# by seer. If that happens, we'll skip this whole branch and jump down to the same one
# we would have landed in had we found a group to begin with.
existing_grouphash, root_hierarchical_hash = find_existing_grouphash(
project, flat_grouphashes, hashes.hierarchical_hashes
)
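
Stripped of the hierarchical-hash handling, the double-checked-locking dance described in the comments above looks roughly like this — a sketch rather than the literal code, using the `GroupHash` model already imported in this file, with `find_seer_match_or_create_group` as a stand-in for the Seer/creation logic this PR adds:

from django.db import transaction

def assign_group_to_hashes(project, grouphash_ids):
    # Check #1 happened before we got here: none of the grouphashes had a group.
    with transaction.atomic():
        # Lock the rows; identically-hashed events block here until we commit.
        locked = list(GroupHash.objects.filter(id__in=grouphash_ids).select_for_update())

        # Check #2, now under the lock: another event may have created or assigned
        # a group while we were waiting.
        existing = next((gh for gh in locked if gh.group_id is not None), None)
        if existing is not None:
            return existing.group, False  # (group, is_new)

        # Still nothing, so we're first: consult Seer and/or create a new group,
        # then point the locked grouphashes at it before releasing the lock.
        group = find_seer_match_or_create_group(project)  # stand-in
        GroupHash.objects.filter(id__in=grouphash_ids).update(group=group)
        return group, True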
@@ -1474,10 +1489,34 @@
else:
root_hierarchical_grouphash = None

# If we still haven't found a matching grouphash, we're now safe to go ahead and create
# the group.
# If we still haven't found a matching grouphash, we're now safe to go ahead and talk to
# seer and/or create the group.
if existing_grouphash is None:
group = _create_group(project, event, **group_creation_kwargs)
seer_matched_group = None

if should_call_seer_for_grouping(event, project):
try:
# If the `projects:similarity-embeddings-grouping` feature is disabled,
# we'll still get back result metadata, but `seer_matched_group` will be None
seer_response_data, seer_matched_group = get_seer_similar_issues(
event, primary_hashes
)
event.data["seer_similarity"] = seer_response_data

# We only want to add this data to new groups, while we're testing
# TODO: Remove this once we're out of the testing phase
Comment on lines +1506 to +1507

Contributor:

Wouldn't we want to keep that for debugging even after the testing phase?

Member Author:

We might end up keeping it, but the current plan is to take it out eventually. For debugging purposes, we have the same information on the event, and hopefully it will also be added to the corresponding GroupHash record. (See #70454.)

if not seer_matched_group:
group_creation_kwargs["data"]["metadata"][
"seer_similarity"
] = seer_response_data

# Insurance - in theory we shouldn't ever land here
except Exception as e:
Contributor:

Do we know what exception types we're catching here? Or is this meant to be a generic catch so that we don't break anything and lose events?

Member Author (@lobsterkatie, May 17, 2024):

Much more the latter. Any reasonably-anticipatable errors in that logic are caught before they bubble up this high, but the last thing I want to do is break ingestion, so it seemed wise to try-catch it just in case. (But yeah, I can add a comment, because it's a reasonable question. [UPDATE: Done.])

sentry_sdk.capture_exception(
e, tags={"event": event.event_id, "project": project.id}
)

group = seer_matched_group or _create_group(project, event, **group_creation_kwargs)

if root_hierarchical_grouphash is not None:
new_hashes = [root_hierarchical_grouphash]
@@ -1488,41 +1527,64 @@ def _save_aggregate(
state=GroupHash.State.LOCKED_IN_MIGRATION
).update(group=group)

is_new = True
is_regression = False
is_new = not seer_matched_group
is_regression = (
False
if is_new
else _process_existing_aggregate(
# If `seer_matched_group` were `None`, `is_new` would be true and we
# wouldn't be here
group=NonNone(seer_matched_group),
event=event,
incoming_group_values=group_creation_kwargs,
release=release,
)
)

span.set_tag("outcome", "new_group")
metric_tags["outcome"] = "new_group"
span.set_tag("outcome", "new_group" if is_new else "seer_match")
metric_tags["outcome"] = "new_group" if is_new else "seer_match"
record_calculation_metric_with_result(
project=project,
has_secondary_hashes=has_secondary_hashes,
result="no_match",
)

metrics.incr(
"group.created",
skip_internal=True,
tags={
"platform": event.platform or "unknown",
"sdk": normalized_sdk_tag_from_event(event),
},
)

# This only applies to events with stacktraces
frame_mix = event.get_event_metadata().get("in_app_frame_mix")
if frame_mix:
if is_new:
metrics.incr(
"grouping.in_app_frame_mix",
sample_rate=1.0,
"group.created",
skip_internal=True,
tags={
"platform": event.platform or "unknown",
"sdk": normalized_sdk_tag_from_event(event),
"frame_mix": frame_mix,
},
)

# This only applies to events with stacktraces, and we only do this for new
# groups, because we assume that if Seer puts an event in an existing group, it
# and the existing group have the same frame mix
frame_mix = event.get_event_metadata().get("in_app_frame_mix")
if frame_mix:
metrics.incr(
"grouping.in_app_frame_mix",
sample_rate=1.0,
tags={
"platform": event.platform or "unknown",
"sdk": normalized_sdk_tag_from_event(event),
"frame_mix": frame_mix,
},
)

return GroupInfo(group, is_new, is_regression)

# If we land here, it's because either:
#
# a) There's an existing group with one of our hashes and we found it the first time we looked.
#
# b) We didn't find a group the first time we looked, but another identically-hashed event beat
# us to the lock and while we were waiting either created a new group or assigned our hashes to
# a neighboring group suggested by seer - such that when we finally got the lock and looked
# again, this time there was a group to find.

group = Group.objects.get(id=existing_grouphash.group_id)
if group.issue_category != GroupCategory.ERROR:
logger.info(
@@ -1559,9 +1621,10 @@ def _save_aggregate(
record_calculation_metric_with_result(
project=project,
has_secondary_hashes=has_secondary_hashes,
# If at least one primary hash value isn't new, then we will have found it, since we check
# those before the secondary hash values. If the primary hash values are all new, then we
# must have found a secondary hash (or we'd be in the group-creation branch).
# If at least one primary hash value isn't new, then we'll definitely have found it, since
# we check all of the primary hashes before any secondary ones. If the primary hash values
# *are* all new, then we must have gotten here by finding a secondary hash (or we'd be in
# the group-creation/seer-consultation branch).
result="found_primary" if not all_primary_hashes_are_new else "found_secondary",
)

@@ -1604,6 +1667,8 @@ def _save_aggregate(
return GroupInfo(group, is_new, is_regression)


# TODO: None of the seer logic has been added to this version yet, so you can't simultaneously use
# optimized transitions and seer
def _save_aggregate_new(
event: Event,
job: Job,
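
Condensed, the new decision this PR adds on the group-creation path is roughly the following — a summary sketch of the diff above with locking and error handling omitted; only `should_call_seer_for_grouping`, `get_seer_similar_issues`, and `_create_group` are real names from the diff, and the wrapper function itself is a stand-in:

def _get_group_from_seer_or_create(event, project, primary_hashes, group_creation_kwargs):
    if should_call_seer_for_grouping(event, project):
        # Seer returns metadata either way; the matched group is None when the
        # similarity-embeddings feature is off or no close-enough neighbor exists.
        seer_response_data, seer_matched_group = get_seer_similar_issues(event, primary_hashes)
        event.data["seer_similarity"] = seer_response_data

        if seer_matched_group is not None:
            return seer_matched_group, False  # (group, is_new)

        # No close match: stash what Seer said on the new group's metadata (testing phase only)
        group_creation_kwargs["data"]["metadata"]["seer_similarity"] = seer_response_data

    # Seer wasn't consulted or found nothing close enough: fall back to a brand-new group
    return _create_group(project, event, **group_creation_kwargs), True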