Skip to content

Commit 38d3a32

Browse files
authored
Send both process level and cpu level metrics (#35753)
1 parent 47374d3 commit 38d3a32

File tree

6 files changed

+131
-32
lines changed

6 files changed

+131
-32
lines changed

sdk/monitor/azure-monitor-opentelemetry-exporter/CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
- Update live metrics to use typespec generated swagger
1717
([#34840](https://github.com/Azure/azure-sdk-for-python/pull/34840))
18+
- Send old and new process level live metrics
19+
([#35753](https://github.com/Azure/azure-sdk-for-python/pull/35753))
1820

1921
## 1.0.0b25 (2024-04-19)
2022

sdk/monitor/azure-monitor-opentelemetry-exporter/azure/monitor/opentelemetry/exporter/_quickpulse/_constants.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,10 @@
77
# (OpenTelemetry metric name, Quickpulse metric name)
88
# Memory
99
_COMMITTED_BYTES_NAME = ("azuremonitor.memorycommittedbytes", "\\Memory\\Committed Bytes")
10+
_PROCESS_PHYSICAL_BYTES_NAME = ("azuremonitor.processphysicalbytes", "\\Process\\Physical Bytes")
1011
# CPU
1112
_PROCESSOR_TIME_NAME = ("azuremonitor.processortotalprocessortime", "\\Processor(_Total)\\% Processor Time")
13+
_PROCESS_TIME_NORMALIZED_NAME = ("azuremonitor.processtimenormalized", "\\% Process\\Processor Time Normalized")
1214
# Request
1315
_REQUEST_RATE_NAME = ("azuremonitor.requestssec", "\\ApplicationInsights\\Requests/Sec")
1416
_REQUEST_FAILURE_RATE_NAME = ("azuremonitor.requestsfailedsec", "\\ApplicationInsights\\Requests Failed/Sec")
@@ -23,8 +25,9 @@
2325
_QUICKPULSE_METRIC_NAME_MAPPINGS = dict(
2426
[
2527
_COMMITTED_BYTES_NAME,
28+
_PROCESS_PHYSICAL_BYTES_NAME,
2629
_PROCESSOR_TIME_NAME,
27-
_PROCESSOR_TIME_NAME,
30+
_PROCESS_TIME_NORMALIZED_NAME,
2831
_REQUEST_RATE_NAME,
2932
_REQUEST_FAILURE_RATE_NAME,
3033
_REQUEST_DURATION_NAME,

sdk/monitor/azure-monitor-opentelemetry-exporter/azure/monitor/opentelemetry/exporter/_quickpulse/_live_metrics.py

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# Copyright (c) Microsoft Corporation. All rights reserved.
22
# Licensed under the MIT License.
33
# cSpell:disable
4-
4+
from datetime import datetime
55
from typing import Any, Iterable, Optional
66

77
import platform
@@ -23,6 +23,8 @@
2323
_DEPENDENCY_FAILURE_RATE_NAME,
2424
_DEPENDENCY_RATE_NAME,
2525
_EXCEPTION_RATE_NAME,
26+
_PROCESS_PHYSICAL_BYTES_NAME,
27+
_PROCESS_TIME_NORMALIZED_NAME,
2628
_PROCESSOR_TIME_NAME,
2729
_REQUEST_DURATION_NAME,
2830
_REQUEST_FAILURE_RATE_NAME,
@@ -37,7 +39,13 @@
3739
_QuickpulseState,
3840
_is_post_state,
3941
_append_quickpulse_document,
42+
_get_quickpulse_last_process_cpu,
43+
_get_quickpulse_last_process_time,
44+
_get_quickpulse_process_elapsed_time,
4045
_set_global_quickpulse_state,
46+
_set_quickpulse_last_process_cpu,
47+
_set_quickpulse_last_process_time,
48+
_set_quickpulse_process_elapsed_time,
4149
)
4250
from azure.monitor.opentelemetry.exporter._quickpulse._utils import (
4351
_get_log_record_document,
@@ -55,6 +63,7 @@
5563

5664

5765
PROCESS = psutil.Process()
66+
NUM_CPUS = psutil.cpu_count()
5867

5968
def enable_live_metrics(**kwargs: Any) -> None: # pylint: disable=C4758
6069
"""Live metrics entry point.
@@ -129,13 +138,21 @@ def __init__(self, connection_string: Optional[str], resource: Optional[Resource
129138
"exc/sec",
130139
"live metrics exception rate per second"
131140
)
132-
self._process_memory_gauge = self._meter.create_observable_gauge(
141+
self._process_memory_gauge_old = self._meter.create_observable_gauge(
133142
_COMMITTED_BYTES_NAME[0],
134143
[_get_process_memory],
135144
)
136-
self._processor_time_gauge = self._meter.create_observable_gauge(
145+
self._process_memory_gauge = self._meter.create_observable_gauge(
146+
_PROCESS_PHYSICAL_BYTES_NAME[0],
147+
[_get_process_memory],
148+
)
149+
self._process_time_gauge_old = self._meter.create_observable_gauge(
137150
_PROCESSOR_TIME_NAME[0],
138-
[_get_processor_time],
151+
[_get_process_time_normalized_old],
152+
)
153+
self._process_time_gauge = self._meter.create_observable_gauge(
154+
_PROCESS_TIME_NORMALIZED_NAME[0],
155+
[_get_process_time_normalized],
139156
)
140157

141158
def _record_span(self, span: ReadableSpan) -> None:
@@ -178,19 +195,40 @@ def _record_log_record(self, log_data: LogData) -> None:
178195

179196
# pylint: disable=unused-argument
180197
def _get_process_memory(options: CallbackOptions) -> Iterable[Observation]:
    """Observable-gauge callback yielding the current process memory usage.

    :param options: Callback options supplied by the metrics SDK; unused.
    :return: A single ``Observation`` holding the process RSS, or ``0`` when
        psutil raises ``NoSuchProcess`` or ``AccessDenied``.
    """
    try:
        # rss is non-swapped physical memory a process has used
        rss_bytes = PROCESS.memory_info().rss
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        # Best effort: fall back to 0 rather than failing the collection.
        rss_bytes = 0
    yield Observation(rss_bytes, {})
205+
206+
207+
# pylint: disable=unused-argument
208+
def _get_process_time_normalized_old(options: CallbackOptions) -> Iterable[Observation]:
    """Observable-gauge callback yielding normalized process CPU usage.

    Computes the process CPU time (user + system) consumed since the previous
    collection, divides it by the wall-clock time elapsed since that collection,
    and normalizes by the number of logical CPUs. The result is also cached in
    the quickpulse state via ``_set_quickpulse_last_process_cpu``.

    :param options: Callback options supplied by the metrics SDK; unused.
    :return: A single ``Observation`` with the normalized value, or ``0.0`` when
        psutil raises ``NoSuchProcess``/``AccessDenied`` or no time has elapsed.
    """
    normalized_cpu_percentage = 0.0
    try:
        cpu_times = PROCESS.cpu_times()
        # total process time is user + system in s
        total_time_s = cpu_times.user + cpu_times.system
        # CPU time consumed since the previous collection
        process_time_s = total_time_s - _get_quickpulse_last_process_time()
        # Remember the cumulative total (not the delta) so the next collection
        # subtracts the correct baseline.
        _set_quickpulse_last_process_time(total_time_s)
        # Find elapsed time in s since last collection
        current_time = datetime.now()
        elapsed_time_s = (current_time - _get_quickpulse_process_elapsed_time()).total_seconds()
        _set_quickpulse_process_elapsed_time(current_time)
        # Obtain cpu usage by dividing by elapsed time
        cpu_percentage = process_time_s / elapsed_time_s
        # Normalize by dividing by amount of logical cpus; psutil.cpu_count()
        # may return None when the count is undetermined, so fall back to 1.
        normalized_cpu_percentage = cpu_percentage / (NUM_CPUS or 1)
        _set_quickpulse_last_process_cpu(normalized_cpu_percentage)
    except (psutil.NoSuchProcess, psutil.AccessDenied, ZeroDivisionError):
        # Best effort: report 0.0 rather than failing the collection.
        pass
    yield Observation(normalized_cpu_percentage, {})
186228

187229

188230
# pylint: disable=unused-argument
189-
def _get_processor_time(options: CallbackOptions) -> Iterable[Observation]:
190-
# Processor time does not include idle time
191-
yield Observation(
192-
100 - psutil.cpu_times_percent().idle,
193-
{},
194-
)
231+
def _get_process_time_normalized(options: CallbackOptions) -> Iterable[Observation]:
    """Observable-gauge callback yielding the cached normalized process CPU value.

    Performs no measurement itself; it reports whatever value is currently held
    in the quickpulse state.

    :param options: Callback options supplied by the metrics SDK; unused.
    :return: A single ``Observation`` with the last stored value.
    """
    last_cpu = _get_quickpulse_last_process_cpu()
    yield Observation(last_cpu, {})
195233

196234
# cSpell:enable

sdk/monitor/azure-monitor-opentelemetry-exporter/azure/monitor/opentelemetry/exporter/_quickpulse/_state.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Copyright (c) Microsoft Corporation. All rights reserved.
22
# Licensed under the MIT License.
3+
from datetime import datetime
34
from enum import Enum
45
from typing import List
56

@@ -23,8 +24,11 @@ class _QuickpulseState(Enum):
2324

2425
_GLOBAL_QUICKPULSE_STATE = _QuickpulseState.OFFLINE
2526
_QUICKPULSE_DOCUMENTS: List[DocumentIngress] = []
27+
_QUICKPULSE_LAST_PROCESS_TIME = 0.0
28+
_QUICKPULSE_PROCESS_ELAPSED_TIME = datetime.now()
29+
_QUICKPULSE_LAST_PROCESS_CPU = 0.0
2630

27-
def _set_global_quickpulse_state(state: _QuickpulseState):
31+
def _set_global_quickpulse_state(state: _QuickpulseState) -> None:
    """Replace the module-wide quickpulse state with ``state``."""
    # pylint: disable=global-statement
    global _GLOBAL_QUICKPULSE_STATE
    _GLOBAL_QUICKPULSE_STATE = state
@@ -34,6 +38,36 @@ def _get_global_quickpulse_state() -> _QuickpulseState:
3438
return _GLOBAL_QUICKPULSE_STATE
3539

3640

41+
def _set_quickpulse_last_process_time(time: float) -> None:
    """Store ``time`` as the module-wide last process time value."""
    # pylint: disable=global-statement
    global _QUICKPULSE_LAST_PROCESS_TIME
    _QUICKPULSE_LAST_PROCESS_TIME = time
45+
46+
47+
def _get_quickpulse_last_process_time() -> float:
    """Return the module-wide last process time value (initially ``0.0``)."""
    last_process_time = _QUICKPULSE_LAST_PROCESS_TIME
    return last_process_time
49+
50+
51+
def _set_quickpulse_process_elapsed_time(time: datetime) -> None:
    """Store ``time`` as the module-wide timestamp of the last process-time collection."""
    # pylint: disable=global-statement
    global _QUICKPULSE_PROCESS_ELAPSED_TIME
    _QUICKPULSE_PROCESS_ELAPSED_TIME = time
55+
56+
57+
def _get_quickpulse_process_elapsed_time() -> datetime:
    """Return the stored collection timestamp (initialized to ``datetime.now()`` at import)."""
    last_collection = _QUICKPULSE_PROCESS_ELAPSED_TIME
    return last_collection
59+
60+
61+
def _set_quickpulse_last_process_cpu(time: float) -> None:
    """Store the module-wide last process CPU value.

    :param time: The CPU value to cache. The parameter name is historical; the
        value is stored as-is.
    """
    # pylint: disable=global-statement
    global _QUICKPULSE_LAST_PROCESS_CPU
    _QUICKPULSE_LAST_PROCESS_CPU = time
65+
66+
67+
def _get_quickpulse_last_process_cpu() -> float:
    """Return the module-wide last process CPU value (initially ``0.0``)."""
    last_process_cpu = _QUICKPULSE_LAST_PROCESS_CPU
    return last_process_cpu
69+
70+
3771
def is_quickpulse_enabled() -> bool:
    """Return ``True`` unless the global quickpulse state is ``OFFLINE``."""
    current_state = _get_global_quickpulse_state()
    return current_state is not _QuickpulseState.OFFLINE
3973

sdk/monitor/azure-monitor-opentelemetry-exporter/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@
8787
"msrest>=0.6.10",
8888
"opentelemetry-api~=1.21",
8989
"opentelemetry-sdk~=1.21",
90-
"psutil>=5.9.8",
90+
"psutil~=5.9",
9191
],
9292
entry_points={
9393
"opentelemetry_traces_exporter": [

sdk/monitor/azure-monitor-opentelemetry-exporter/tests/quickpulse/test_live_metrics.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55

66
import collections
77
import platform
8+
import psutil
89
import unittest
10+
from datetime import datetime, timedelta
911
from unittest import mock
1012

1113
from opentelemetry.sdk.metrics import (
@@ -21,12 +23,12 @@
2123

2224
from azure.monitor.opentelemetry.exporter._generated.models import ContextTagKeys
2325
from azure.monitor.opentelemetry.exporter._quickpulse._constants import (
24-
_COMMITTED_BYTES_NAME,
2526
_DEPENDENCY_DURATION_NAME,
2627
_DEPENDENCY_FAILURE_RATE_NAME,
2728
_DEPENDENCY_RATE_NAME,
2829
_EXCEPTION_RATE_NAME,
29-
_PROCESSOR_TIME_NAME,
30+
_PROCESS_PHYSICAL_BYTES_NAME,
31+
_PROCESS_TIME_NORMALIZED_NAME,
3032
_REQUEST_DURATION_NAME,
3133
_REQUEST_FAILURE_RATE_NAME,
3234
_REQUEST_RATE_NAME,
@@ -38,7 +40,8 @@
3840
from azure.monitor.opentelemetry.exporter._quickpulse._live_metrics import (
3941
enable_live_metrics,
4042
_get_process_memory,
41-
_get_processor_time,
43+
_get_process_time_normalized,
44+
_get_process_time_normalized_old,
4245
_QuickpulseManager,
4346
)
4447
from azure.monitor.opentelemetry.exporter._quickpulse._state import (
@@ -65,6 +68,7 @@ def test_enable_live_metrics(self, manager_mock):
6568

6669

6770
class TestQuickpulseManager(unittest.TestCase):
71+
6872
@classmethod
6973
def setUpClass(cls):
7074
_set_global_quickpulse_state(_QuickpulseState.PING_SHORT)
@@ -131,11 +135,11 @@ def test_init(self, generator_mock):
131135
self.assertTrue(isinstance(qpm._exception_rate_counter, Counter))
132136
self.assertEqual(qpm._exception_rate_counter.name, _EXCEPTION_RATE_NAME[0])
133137
self.assertTrue(isinstance(qpm._process_memory_gauge, ObservableGauge))
134-
self.assertEqual(qpm._process_memory_gauge.name, _COMMITTED_BYTES_NAME[0])
138+
self.assertEqual(qpm._process_memory_gauge.name, _PROCESS_PHYSICAL_BYTES_NAME[0])
135139
self.assertEqual(qpm._process_memory_gauge._callbacks, [_get_process_memory])
136-
self.assertTrue(isinstance(qpm._processor_time_gauge, ObservableGauge))
137-
self.assertEqual(qpm._processor_time_gauge.name, _PROCESSOR_TIME_NAME[0])
138-
self.assertEqual(qpm._processor_time_gauge._callbacks, [_get_processor_time])
140+
self.assertTrue(isinstance(qpm._process_time_gauge, ObservableGauge))
141+
self.assertEqual(qpm._process_time_gauge.name, _PROCESS_TIME_NORMALIZED_NAME[0])
142+
self.assertEqual(qpm._process_time_gauge._callbacks, [_get_process_time_normalized])
139143

140144

141145
def test_singleton(self):
@@ -301,13 +305,31 @@ def test_process_memory(self):
301305
obs = next(mem)
302306
self.assertEqual(obs.value, 40)
303307

304-
@mock.patch("psutil.cpu_times_percent")
305-
def test_processor_time(self, processor_mock):
306-
cpu = collections.namedtuple('cpu', 'idle')
307-
cpu_times = cpu(idle=94.5)
308-
processor_mock.return_value = cpu_times
309-
time = _get_processor_time(None)
308+
def test_process_memory_error(self):
    """The memory callback reports 0 when psutil raises NoSuchProcess."""
    process_target = "azure.monitor.opentelemetry.exporter._quickpulse._live_metrics.PROCESS"
    with mock.patch(process_target) as process_mock:
        memory = collections.namedtuple('memory', 'rss')
        process_mock.memory_info.return_value = memory(rss=40)
        # side_effect takes precedence over return_value, so the call raises.
        process_mock.memory_info.side_effect = psutil.NoSuchProcess(1)
        observations = _get_process_memory(None)
        obs = next(observations)
        self.assertEqual(obs.value, 0)
317+
318+
@mock.patch("azure.monitor.opentelemetry.exporter._quickpulse._live_metrics._get_quickpulse_process_elapsed_time")
@mock.patch("azure.monitor.opentelemetry.exporter._quickpulse._live_metrics._get_quickpulse_last_process_time")
@mock.patch("azure.monitor.opentelemetry.exporter._quickpulse._live_metrics.PROCESS")
def test_process_time(self, process_mock, process_time_mock, elapsed_time_mock):
    """The old CPU callback reports (cpu delta / elapsed seconds) / logical CPUs.

    With 10.4s of total process time, 4.4s previously recorded and 5s elapsed,
    the un-normalized value is 6.0 / 5 = 1.2.
    """
    live_metrics = "azure.monitor.opentelemetry.exporter._quickpulse._live_metrics"
    current = datetime.now()
    cpu = collections.namedtuple("cpu", ['user', 'system'])
    process_mock.cpu_times.return_value = cpu(user=3.6, system=6.8)
    process_time_mock.return_value = 4.4
    elapsed_time_mock.return_value = current - timedelta(seconds=5)
    # Pin the CPU count so the expected value does not depend on the host.
    num_cpus = 4
    # _live_metrics binds the name via `from datetime import datetime`, so the
    # clock must be patched on that module; patching "datetime.datetime" has no
    # effect on the already-imported name.
    with mock.patch(live_metrics + ".datetime") as datetime_mock, \
            mock.patch(live_metrics + ".NUM_CPUS", num_cpus):
        datetime_mock.now.return_value = current
        # The callback is a generator: advance it while the patches are active.
        obs = next(_get_process_time_normalized_old(None))
    self.assertAlmostEqual(obs.value, 1.2 / num_cpus)
312334

313335
# cSpell:enable

0 commit comments

Comments
 (0)