Set up metrics collection with OpenTelemetry (#11236)
Done:
- move the metrics and health cache services from the health module to the metrics module
- refactor the metrics counter from metric-specific methods to a generic setup driven by enum keys
- add OpenTelemetry (OTel) instrumentation for metrics
- set up the OTel SDK to send metrics to the OTel collector

To do later:
- implement OTel instrumentation for traces and plug Sentry on top
This commit is contained in:
@ -0,0 +1,31 @@
|
||||
import { MetricsKeys } from 'src/engine/core-modules/metrics/types/metrics-keys.type';
|
||||
|
||||
/**
 * Message-channel sync job statuses, each paired with the metrics key under
 * which events for that status are recorded.
 */
export const MESSAGE_SYNC_METRICS_BY_STATUS = [
  { name: 'ACTIVE', cacheKey: MetricsKeys.MessageChannelSyncJobActive },
  {
    name: 'FAILED_UNKNOWN',
    cacheKey: MetricsKeys.MessageChannelSyncJobFailedUnknown,
  },
  {
    name: 'FAILED_INSUFFICIENT_PERMISSIONS',
    cacheKey: MetricsKeys.MessageChannelSyncJobFailedInsufficientPermissions,
  },
];
|
||||
|
||||
/**
 * Calendar-event sync job statuses, each paired with the metrics key under
 * which events for that status are recorded.
 */
export const CALENDAR_SYNC_METRICS_BY_STATUS = [
  { name: 'ACTIVE', cacheKey: MetricsKeys.CalendarEventSyncJobActive },
  {
    name: 'FAILED_UNKNOWN',
    cacheKey: MetricsKeys.CalendarEventSyncJobFailedUnknown,
  },
  {
    name: 'FAILED_INSUFFICIENT_PERMISSIONS',
    cacheKey: MetricsKeys.CalendarEventSyncJobFailedInsufficientPermissions,
  },
];
|
||||
@ -0,0 +1,94 @@
|
||||
import { Injectable } from '@nestjs/common';
|
||||
|
||||
import { InjectCacheStorage } from 'src/engine/core-modules/cache-storage/decorators/cache-storage.decorator';
|
||||
import { CacheStorageService } from 'src/engine/core-modules/cache-storage/services/cache-storage.service';
|
||||
import { CacheStorageNamespace } from 'src/engine/core-modules/cache-storage/types/cache-storage-namespace.enum';
|
||||
import { EnvironmentService } from 'src/engine/core-modules/environment/environment.service';
|
||||
import { MetricsKeys } from 'src/engine/core-modules/metrics/types/metrics-keys.type';
|
||||
|
||||
/** Width of a single cache bucket, in milliseconds (15 seconds). */
const CACHE_BUCKET_DURATION_MS = 15 * 1000;
|
||||
|
||||
/**
 * Stores metric events in time-bucketed cache sets and counts them over a
 * time window.
 *
 * Each bucket spans CACHE_BUCKET_DURATION_MS and is addressed by a cache key
 * of the form `<metric key>:<bucket start timestamp>`.
 */
@Injectable()
export class MetricsCacheService {
  private readonly healthMetricsTimeWindowInMinutes: number;
  private readonly healthCacheTtl: number;

  constructor(
    @InjectCacheStorage(CacheStorageNamespace.EngineHealth)
    private readonly cacheStorage: CacheStorageService,
    private readonly environmentService: EnvironmentService,
  ) {
    this.healthMetricsTimeWindowInMinutes = this.environmentService.get(
      'HEALTH_METRICS_TIME_WINDOW_IN_MINUTES',
    );

    // Twice the configured window, converted from minutes to milliseconds.
    const windowInMs = this.healthMetricsTimeWindowInMinutes * 60000;

    this.healthCacheTtl = windowInMs * 2;
  }

  /** Rounds `timestamp` down to the start of the bucket that contains it. */
  private getCacheBucketStartTimestamp(timestamp: number): number {
    const bucketIndex = Math.floor(timestamp / CACHE_BUCKET_DURATION_MS);

    return bucketIndex * CACHE_BUCKET_DURATION_MS;
  }

  /**
   * Builds the bucket-scoped cache key for `key`.
   *
   * @param key - Base metric key.
   * @param timestamp - Bucket start to embed in the key; when omitted, the
   *   start of the bucket containing the current time is used.
   */
  private getCacheKeyWithTimestamp(key: string, timestamp?: number): string {
    const bucketStart =
      timestamp ?? this.getCacheBucketStartTimestamp(Date.now());

    return `${key}:${bucketStart}`;
  }

  /**
   * Lists the start timestamps of the `cacheBucketsCount` most recent buckets,
   * beginning with the bucket that contains `date` and walking backwards.
   */
  private getLastCacheBucketStartTimestampsFromDate(
    cacheBucketsCount: number,
    date: number,
  ): number[] {
    const newestBucketStart = this.getCacheBucketStartTimestamp(date);
    const bucketStartAt = (_: unknown, bucketOffset: number): number =>
      newestBucketStart - bucketOffset * CACHE_BUCKET_DURATION_MS;

    return Array.from({ length: cacheBucketsCount }, bucketStartAt);
  }

  /**
   * Adds `items` to the set stored for `key` in the current bucket, passing
   * the service's TTL to the cache storage.
   */
  async updateCounter(key: MetricsKeys, items: string[]) {
    const currentBucketKey = this.getCacheKeyWithTimestamp(key);

    return await this.cacheStorage.setAdd(
      currentBucketKey,
      items,
      this.healthCacheTtl,
    );
  }

  /**
   * Counts the set members recorded for `key` across the buckets covering the
   * requested time window.
   *
   * @param key - Metric to count.
   * @param timeWindowInSeconds - Window length; defaults to the configured
   *   health metrics window. Must be a whole number of buckets.
   * @param date - End of the window as an epoch timestamp in milliseconds;
   *   defaults to now.
   * @returns The value reported by the cache storage for those bucket keys.
   * @throws Error when the window is not a multiple of the bucket duration.
   */
  async computeCount({
    key,
    timeWindowInSeconds = this.healthMetricsTimeWindowInMinutes * 60,
    date = Date.now(),
  }: {
    key: MetricsKeys;
    timeWindowInSeconds?: number;
    date?: number;
  }): Promise<number> {
    const timeWindowInMs = timeWindowInSeconds * 1000;
    const isWholeNumberOfBuckets =
      timeWindowInMs % CACHE_BUCKET_DURATION_MS === 0;

    if (!isWholeNumberOfBuckets) {
      throw new Error(
        `Time window must be divisible by ${CACHE_BUCKET_DURATION_MS}`,
      );
    }

    const bucketDurationInSeconds = CACHE_BUCKET_DURATION_MS / 1000;
    const cacheBuckets = timeWindowInSeconds / bucketDurationInSeconds;
    const cacheKeys = this.computeTimeStampedCacheKeys(key, cacheBuckets, date);

    return await this.cacheStorage.countAllSetMembers(cacheKeys);
  }

  /**
   * Returns the bucket-scoped cache keys for `key`, one per bucket, newest
   * first, for the `cacheBucketsCount` buckets ending at `date`.
   */
  computeTimeStampedCacheKeys(
    key: string,
    cacheBucketsCount: number,
    date: number,
  ) {
    const bucketStarts = this.getLastCacheBucketStartTimestampsFromDate(
      cacheBucketsCount,
      date,
    );

    return bucketStarts.map((bucketStart) =>
      this.getCacheKeyWithTimestamp(key, bucketStart),
    );
  }
}
|
||||
@ -0,0 +1,10 @@
|
||||
import { Module } from '@nestjs/common';
|
||||
|
||||
import { MetricsCacheService } from 'src/engine/core-modules/metrics/metrics-cache.service';
|
||||
import { MetricsService } from 'src/engine/core-modules/metrics/metrics.service';
|
||||
|
||||
// Services this module both provides and re-exports to importing modules.
const METRICS_PROVIDERS = [MetricsService, MetricsCacheService];

/** Nest module exposing the metrics services. */
@Module({
  providers: [...METRICS_PROVIDERS],
  exports: [...METRICS_PROVIDERS],
})
export class MetricsModule {}
|
||||
@ -0,0 +1,70 @@
|
||||
import { Injectable } from '@nestjs/common';
|
||||
|
||||
import { metrics } from '@opentelemetry/api';
|
||||
|
||||
import { MetricsCacheService } from 'src/engine/core-modules/metrics/metrics-cache.service';
|
||||
import { MetricsKeys } from 'src/engine/core-modules/metrics/types/metrics-keys.type';
|
||||
|
||||
/**
 * Records metric events as OpenTelemetry counters and, optionally, in the
 * metrics cache so that recent counts can be read back with `groupMetrics`.
 */
@Injectable()
export class MetricsService {
  constructor(private readonly metricsCacheService: MetricsCacheService) {}

  /**
   * Records a single event for `key`.
   *
   * @param key - Metric to increment.
   * @param eventId - Identifier stored in the cache set for this event.
   * @param shouldStoreInCache - When true (default), also writes the event to
   *   the metrics cache.
   */
  async incrementCounter({
    key,
    eventId,
    shouldStoreInCache = true,
  }: {
    key: MetricsKeys;
    eventId: string;
    shouldStoreInCache?: boolean;
  }): Promise<void> {
    await this.recordEvents(key, [eventId], shouldStoreInCache);
  }

  /**
   * Records several events for `key` in one call.
   *
   * @param key - Metric to increment.
   * @param eventIds - Identifiers stored in the cache set; the counter is
   *   increased by the number of identifiers.
   * @param shouldStoreInCache - When true (default), also writes the events to
   *   the metrics cache.
   */
  async batchIncrementCounter({
    key,
    eventIds,
    shouldStoreInCache = true,
  }: {
    key: MetricsKeys;
    eventIds: string[];
    shouldStoreInCache?: boolean;
  }): Promise<void> {
    await this.recordEvents(key, eventIds, shouldStoreInCache);
  }

  /**
   * Reads the cached count of every metric in `metrics` at a single shared
   * point in time.
   *
   * @param metrics - Display name / cache key pairs to look up.
   * @returns A record mapping each `name` to its count, in input order.
   */
  async groupMetrics(
    metrics: { name: string; cacheKey: MetricsKeys }[],
  ): Promise<Record<string, number>> {
    // One timestamp for all lookups so every metric covers the same window.
    const date = Date.now();

    // The lookups are independent reads, so they are issued concurrently.
    const metricValues = await Promise.all(
      metrics.map((metric) =>
        this.metricsCacheService.computeCount({ key: metric.cacheKey, date }),
      ),
    );

    const groupedMetrics: Record<string, number> = {};

    metrics.forEach((metric, index) => {
      groupedMetrics[metric.name] = metricValues[index];
    });

    return groupedMetrics;
  }

  /** Adds the events to the OpenTelemetry counter and, if requested, the cache. */
  private async recordEvents(
    key: MetricsKeys,
    eventIds: string[],
    shouldStoreInCache: boolean,
  ): Promise<void> {
    //TODO : Define meter name usage in monitoring
    // The meter is looked up on every call rather than cached on the instance.
    const meter = metrics.getMeter('twenty-server');
    const counter = meter.createCounter(key);

    counter.add(eventIds.length);

    // Nothing to persist for an empty batch.
    if (!shouldStoreInCache || eventIds.length === 0) {
      return;
    }

    // Awaited so that a failing cache write rejects the caller's promise
    // instead of surfacing as an unhandled rejection.
    await this.metricsCacheService.updateCounter(key, eventIds);
  }
}
|
||||
@ -0,0 +1,4 @@
|
||||
/**
 * Identifiers of the available meter drivers.
 * NOTE(review): presumably selects where metrics are exported — confirm
 * against the SDK setup code, which is not part of this file.
 */
export enum MeterDriver {
  OpenTelemetry = 'opentelemetry',
  Console = 'console',
}
|
||||
@ -0,0 +1,9 @@
|
||||
/**
 * Keys identifying each tracked metric. The string value is used both as the
 * OpenTelemetry counter name and as the base of the cache key.
 */
export enum MetricsKeys {
  // Message channel sync job outcomes.
  MessageChannelSyncJobActive = 'message-channel-sync-job/active',
  MessageChannelSyncJobFailedInsufficientPermissions = 'message-channel-sync-job/failed-insufficient-permissions',
  MessageChannelSyncJobFailedUnknown = 'message-channel-sync-job/failed-unknown',

  // Calendar event sync job outcomes.
  CalendarEventSyncJobActive = 'calendar-event-sync-job/active',
  CalendarEventSyncJobFailedInsufficientPermissions = 'calendar-event-sync-job/failed-insufficient-permissions',
  CalendarEventSyncJobFailedUnknown = 'calendar-event-sync-job/failed-unknown',

  // Captcha validation.
  InvalidCaptcha = 'invalid-captcha',
}
|
||||
Reference in New Issue
Block a user