-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
feat(seer grouping): Call Seer before creating a new group #71026
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7cfa181
e3f4021
c6caa2a
a4ad689
a60dc00
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -68,6 +68,7 @@ | |
record_hash_calculation_metrics, | ||
record_new_group_metrics, | ||
) | ||
from sentry.grouping.ingest.seer import get_seer_similar_issues, should_call_seer_for_grouping | ||
from sentry.grouping.ingest.utils import ( | ||
add_group_id_to_grouphashes, | ||
check_for_category_mismatch, | ||
|
@@ -136,6 +137,7 @@ | |
from sentry.utils.safe import get_path, safe_execute, setdefault_path, trim | ||
from sentry.utils.sdk import set_measurement | ||
from sentry.utils.tag_normalization import normalized_sdk_tag_from_event | ||
from sentry.utils.types import NonNone | ||
|
||
if TYPE_CHECKING: | ||
from sentry.eventstore.models import BaseEvent, Event | ||
|
@@ -1436,6 +1438,15 @@ def _save_aggregate( | |
): | ||
# These values will get overridden with whatever happens inside the lock if we do manage | ||
# to acquire it, so it should only end up with `wait-for-lock` if we don't | ||
# | ||
# TODO: If we're using this `outcome` value for anything more than a count in DD (in | ||
# other words, if we care about duration), we should probably update it so that when an | ||
# event does have to wait, we record whether during its wait the event which got the | ||
# lock first | ||
# a) created a new group without consulting Seer, | ||
# b) created a new group because Seer didn't find a close enough match, or | ||
# c) used an existing group found by Seer | ||
# because which of those things happened will have an effect on how long the event had to wait. | ||
span.set_tag("outcome", "wait_for_lock") | ||
metric_tags["outcome"] = "wait_for_lock" | ||
|
||
|
@@ -1444,25 +1455,29 @@ def _save_aggregate( | |
all_grouphash_ids.append(root_hierarchical_grouphash.id) | ||
|
||
# If we're in this branch, we checked our grouphashes and didn't find one with a group | ||
# attached. We thus want to create a new group, but we need to guard against another | ||
# event with the same hash coming in before we're done here and also thinking it needs | ||
# to create a new group. To prevent this, we're using double-checked locking | ||
# attached. We thus want to either ask seer for a nearest neighbor group (and create a | ||
# new group if one isn't found) or just create a new group without consulting seer, but | ||
# either way we need to guard against another event with the same hash coming in before | ||
# we're done here and also thinking it needs to talk to seer and/or create a new group. | ||
# To prevent this, we're using double-checked locking | ||
# (https://en.wikipedia.org/wiki/Double-checked_locking). | ||
|
||
# First, try to lock the relevant rows in the `GroupHash` table. If another (identically | ||
# hashed) event is also in the process of creating a group and has grabbed the lock | ||
# before us, we'll block here until it's done. If not, we've now got the lock and other | ||
# identically-hashed events will have to wait for us. | ||
# hashed) event is already in the process of talking to seer and/or creating a group and | ||
# has grabbed the lock before us, we'll block here until it's done. If not, we've now | ||
# got the lock and other identically-hashed events will have to wait for us. | ||
all_grouphashes = list( | ||
GroupHash.objects.filter(id__in=all_grouphash_ids).select_for_update() | ||
) | ||
|
||
flat_grouphashes = [gh for gh in all_grouphashes if gh.hash in hashes.hashes] | ||
|
||
# Now check again to see if any of our grouphashes have a group. In the first race | ||
# condition scenario above, we'll have been blocked long enough for the other event to | ||
# have created the group and updated our grouphashes with a group id, which means this | ||
# time, we'll find something. | ||
# Now check again to see if any of our grouphashes have a group. If we got the lock, the | ||
# result won't have changed and we still won't find anything. If we didn't get it, we'll | ||
# have blocked until whichever identically-hashed event *did* get the lock has either | ||
# created a new group for our hashes or assigned them to a neighboring group suggested | ||
# by seer. If that happens, we'll skip this whole branch and jump down to the same one | ||
# we would have landed in had we found a group to begin with. | ||
existing_grouphash, root_hierarchical_hash = find_existing_grouphash( | ||
project, flat_grouphashes, hashes.hierarchical_hashes | ||
) | ||
|
@@ -1474,10 +1489,34 @@ def _save_aggregate( | |
else: | ||
root_hierarchical_grouphash = None | ||
|
||
# If we still haven't found a matching grouphash, we're now safe to go ahead and create | ||
# the group. | ||
# If we still haven't found a matching grouphash, we're now safe to go ahead and talk to | ||
# seer and/or create the group. | ||
if existing_grouphash is None: | ||
group = _create_group(project, event, **group_creation_kwargs) | ||
seer_matched_group = None | ||
|
||
if should_call_seer_for_grouping(event, project): | ||
try: | ||
# If the `projects:similarity-embeddings-grouping` feature is disabled, | ||
# we'll still get back result metadata, but `seer_matched_group` will be None | ||
seer_response_data, seer_matched_group = get_seer_similar_issues( | ||
event, primary_hashes | ||
) | ||
event.data["seer_similarity"] = seer_response_data | ||
|
||
# We only want to add this data to new groups, while we're testing | ||
# TODO: Remove this once we're out of the testing phase | ||
if not seer_matched_group: | ||
group_creation_kwargs["data"]["metadata"][ | ||
"seer_similarity" | ||
] = seer_response_data | ||
|
||
# Insurance - in theory we shouldn't ever land here | ||
except Exception as e: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we know what exception types we're catching here? or is this meant to be a generic catch so that we don't break anything and lose events? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Much more the latter. Any reasonably-anticipatable errors in that logic are caught before they bubble up this high, but the last thing I want to do is break ingestion, so it seemed wise to try-catch it just in case. (But yeah, I can add a comment, because it's a reasonable question. [UPDATE: Done.]) |
||
sentry_sdk.capture_exception( | ||
e, tags={"event": event.event_id, "project": project.id} | ||
) | ||
|
||
group = seer_matched_group or _create_group(project, event, **group_creation_kwargs) | ||
|
||
if root_hierarchical_grouphash is not None: | ||
new_hashes = [root_hierarchical_grouphash] | ||
|
@@ -1488,41 +1527,64 @@ def _save_aggregate( | |
state=GroupHash.State.LOCKED_IN_MIGRATION | ||
).update(group=group) | ||
|
||
is_new = True | ||
is_regression = False | ||
is_new = not seer_matched_group | ||
is_regression = ( | ||
False | ||
if is_new | ||
else _process_existing_aggregate( | ||
# If `seer_matched_group` were `None`, `is_new` would be true and we | ||
# wouldn't be here | ||
group=NonNone(seer_matched_group), | ||
event=event, | ||
incoming_group_values=group_creation_kwargs, | ||
release=release, | ||
) | ||
) | ||
|
||
span.set_tag("outcome", "new_group") | ||
metric_tags["outcome"] = "new_group" | ||
span.set_tag("outcome", "new_group" if is_new else "seer_match") | ||
metric_tags["outcome"] = "new_group" if is_new else "seer_match" | ||
record_calculation_metric_with_result( | ||
project=project, | ||
has_secondary_hashes=has_secondary_hashes, | ||
result="no_match", | ||
) | ||
|
||
metrics.incr( | ||
"group.created", | ||
skip_internal=True, | ||
tags={ | ||
"platform": event.platform or "unknown", | ||
"sdk": normalized_sdk_tag_from_event(event), | ||
}, | ||
) | ||
|
||
# This only applies to events with stacktraces | ||
frame_mix = event.get_event_metadata().get("in_app_frame_mix") | ||
if frame_mix: | ||
if is_new: | ||
metrics.incr( | ||
"grouping.in_app_frame_mix", | ||
sample_rate=1.0, | ||
"group.created", | ||
skip_internal=True, | ||
tags={ | ||
"platform": event.platform or "unknown", | ||
"sdk": normalized_sdk_tag_from_event(event), | ||
"frame_mix": frame_mix, | ||
}, | ||
) | ||
|
||
# This only applies to events with stacktraces, and we only do this for new | ||
# groups, because we assume that if Seer puts an event in an existing group, it | ||
# and the existing group have the same frame mix | ||
frame_mix = event.get_event_metadata().get("in_app_frame_mix") | ||
if frame_mix: | ||
metrics.incr( | ||
"grouping.in_app_frame_mix", | ||
sample_rate=1.0, | ||
tags={ | ||
"platform": event.platform or "unknown", | ||
"sdk": normalized_sdk_tag_from_event(event), | ||
"frame_mix": frame_mix, | ||
}, | ||
) | ||
|
||
return GroupInfo(group, is_new, is_regression) | ||
|
||
# If we land here, it's because either: | ||
# | ||
# a) There's an existing group with one of our hashes and we found it the first time we looked. | ||
# | ||
# b) We didn't find a group the first time we looked, but another identically-hashed event beat | ||
# us to the lock and while we were waiting either created a new group or assigned our hashes to | ||
# a neighboring group suggested by seer - such that when we finally got the lock and looked | ||
# again, this time there was a group to find. | ||
|
||
group = Group.objects.get(id=existing_grouphash.group_id) | ||
if group.issue_category != GroupCategory.ERROR: | ||
logger.info( | ||
|
@@ -1559,9 +1621,10 @@ def _save_aggregate( | |
record_calculation_metric_with_result( | ||
project=project, | ||
has_secondary_hashes=has_secondary_hashes, | ||
# If at least one primary hash value isn't new, then we will have found it, since we check | ||
# those before the secondary hash values. If the primary hash values are all new, then we | ||
# must have found a secondary hash (or we'd be in the group-creation branch). | ||
# If at least one primary hash value isn't new, then we'll definitely have found it, since | ||
# we check all of the primary hashes before any secondary ones. If the primary hash values | ||
# *are* all new, then we must have gotten here by finding a secondary hash (or we'd be in | ||
# the group-creation/seer-consultation branch). | ||
result="found_primary" if not all_primary_hashes_are_new else "found_secondary", | ||
) | ||
|
||
|
@@ -1604,6 +1667,8 @@ def _save_aggregate( | |
return GroupInfo(group, is_new, is_regression) | ||
|
||
|
||
# TODO: None of the seer logic has been added to this version yet, so you can't simultaneously use | ||
# optimized transitions and seer | ||
def _save_aggregate_new( | ||
event: Event, | ||
job: Job, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wouldn't we want to keep that for debugging even after testing phase?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We might end up keeping it, but the current plan is to take it out eventually. For debugging purposes, we have the same information
on the event, hopefully added to the corresponding `GroupHash`
record. (See #70454.)