Health status worker metrics improvements (#10442)

Co-authored-by: Félix Malfait <felix.malfait@gmail.com>
This commit is contained in:
nitin
2025-03-04 12:47:12 +05:30
committed by GitHub
parent 41db10daff
commit 327f0cd370
27 changed files with 1468 additions and 312 deletions

View File

@ -1,15 +1,24 @@
import { Test, TestingModule } from '@nestjs/testing';
import { Queue } from 'bullmq';
import { Redis } from 'ioredis';
import { AdminPanelHealthService } from 'src/engine/core-modules/admin-panel/admin-panel-health.service';
import { HEALTH_INDICATORS } from 'src/engine/core-modules/admin-panel/constants/health-indicators.constants';
import { SystemHealth } from 'src/engine/core-modules/admin-panel/dtos/system-health.dto';
import { AdminPanelHealthServiceStatus } from 'src/engine/core-modules/admin-panel/enums/admin-panel-health-service-status.enum';
import { QueueMetricsTimeRange } from 'src/engine/core-modules/admin-panel/enums/queue-metrics-time-range.enum';
import { EnvironmentService } from 'src/engine/core-modules/environment/environment.service';
import { HEALTH_ERROR_MESSAGES } from 'src/engine/core-modules/health/constants/health-error-messages.constants';
import { HealthIndicatorId } from 'src/engine/core-modules/health/enums/health-indicator-id.enum';
import { ConnectedAccountHealth } from 'src/engine/core-modules/health/indicators/connected-account.health';
import { DatabaseHealthIndicator } from 'src/engine/core-modules/health/indicators/database.health';
import { RedisHealthIndicator } from 'src/engine/core-modules/health/indicators/redis.health';
import { WorkerHealthIndicator } from 'src/engine/core-modules/health/indicators/worker.health';
import { MessageQueue } from 'src/engine/core-modules/message-queue/message-queue.constants';
import { RedisClientService } from 'src/engine/core-modules/redis-client/redis-client.service';
jest.mock('bullmq');
describe('AdminPanelHealthService', () => {
let service: AdminPanelHealthService;
@ -17,12 +26,25 @@ describe('AdminPanelHealthService', () => {
let redisHealth: jest.Mocked<RedisHealthIndicator>;
let workerHealth: jest.Mocked<WorkerHealthIndicator>;
let connectedAccountHealth: jest.Mocked<ConnectedAccountHealth>;
let redisClient: jest.Mocked<RedisClientService>;
let environmentService: jest.Mocked<EnvironmentService>;
let loggerSpy: jest.SpyInstance;
beforeEach(async () => {
databaseHealth = { isHealthy: jest.fn() } as any;
redisHealth = { isHealthy: jest.fn() } as any;
workerHealth = { isHealthy: jest.fn() } as any;
workerHealth = { isHealthy: jest.fn(), getQueueDetails: jest.fn() } as any;
connectedAccountHealth = { isHealthy: jest.fn() } as any;
redisClient = {
getClient: jest.fn().mockReturnValue({} as Redis),
} as any;
environmentService = { get: jest.fn() } as any;
(Queue as unknown as jest.Mock) = jest.fn().mockImplementation(() => ({
getMetrics: jest.fn(),
getWorkers: jest.fn(),
close: jest.fn(),
}));
const module: TestingModule = await Test.createTestingModule({
providers: [
@ -31,10 +53,21 @@ describe('AdminPanelHealthService', () => {
{ provide: RedisHealthIndicator, useValue: redisHealth },
{ provide: WorkerHealthIndicator, useValue: workerHealth },
{ provide: ConnectedAccountHealth, useValue: connectedAccountHealth },
{ provide: RedisClientService, useValue: redisClient },
{ provide: EnvironmentService, useValue: environmentService },
],
}).compile();
service = module.get<AdminPanelHealthService>(AdminPanelHealthService);
loggerSpy = jest
.spyOn(service['logger'], 'error')
.mockImplementation(() => {});
});
// Teardown after every test: wipe recorded calls on all mocks, then put the
// real logger `error` method back in place of the spy.
afterEach(() => {
jest.clearAllMocks();
loggerSpy.mockRestore();
});
it('should be defined', () => {
@ -62,8 +95,9 @@ describe('AdminPanelHealthService', () => {
delayed: 4,
failed: 3,
waiting: 0,
prioritized: 0,
failureRate: 0.3,
},
status: 'up',
},
],
},
@ -209,26 +243,12 @@ describe('AdminPanelHealthService', () => {
{
queueName: 'queue1',
workers: 2,
metrics: {
active: 1,
completed: 10,
delayed: 0,
failed: 2,
waiting: 5,
prioritized: 1,
},
status: 'up',
},
{
queueName: 'queue2',
workers: 0,
metrics: {
active: 0,
completed: 5,
delayed: 0,
failed: 1,
waiting: 2,
prioritized: 0,
},
status: 'up',
},
];
@ -248,8 +268,8 @@ describe('AdminPanelHealthService', () => {
status: AdminPanelHealthServiceStatus.OPERATIONAL,
details: undefined,
queues: mockQueues.map((queue) => ({
...queue,
id: `worker-${queue.queueName}`,
queueName: queue.queueName,
status:
queue.workers > 0
? AdminPanelHealthServiceStatus.OPERATIONAL
@ -281,4 +301,305 @@ describe('AdminPanelHealthService', () => {
).rejects.toThrow('Health indicator not found: invalid');
});
});
describe('getQueueMetrics', () => {
// Stand-in for the object produced by `new Queue(...)` in the service.
const mockQueue = {
  getMetrics: jest.fn(),
  getWorkers: jest.fn(),
  close: jest.fn(),
};

// Start each test from clean mocks, then re-wire the Redis client and the
// mocked Queue constructor so the service receives `mockQueue`.
beforeEach(() => {
  jest.clearAllMocks();

  const emptyRedisClient = {} as Redis;
  redisClient.getClient.mockReturnValue(emptyRedisClient);

  const queueConstructor = Queue as unknown as jest.Mock;
  queueConstructor.mockImplementation(() => mockQueue);
});
it('should return metrics data for a queue with correct data transformation', async () => {
  // 240 raw points per series: exactly what the four-hour window asks for.
  const mockCompletedData = Array.from({ length: 240 }, (_, index) => index);
  const mockFailedData = Array.from(
    { length: 240 },
    (_, index) => index * 0.1,
  );

  workerHealth.getQueueDetails.mockResolvedValue({
    queueName: 'test-queue',
    workers: 1,
    status: 'up',
    metrics: {
      active: 1,
      completed: 30,
      failed: 3,
      waiting: 0,
      delayed: 0,
      failureRate: 9.1,
      completedData: mockCompletedData,
      failedData: mockFailedData,
    },
  });

  const result = await service.getQueueMetrics(
    MessageQueue.messagingQueue,
    QueueMetricsTimeRange.FourHours,
  );

  // A series matches when it carries the given id and at least one numeric
  // { x, y } point.
  const numericPoint = () =>
    expect.objectContaining({
      x: expect.any(Number),
      y: expect.any(Number),
    });
  const seriesWithId = (id: string) =>
    expect.objectContaining({
      id,
      data: expect.arrayContaining([numericPoint()]),
    });

  expect(result).toMatchObject({
    queueName: MessageQueue.messagingQueue,
    timeRange: QueueMetricsTimeRange.FourHours,
    workers: 1,
    details: expect.any(Object),
    data: expect.arrayContaining([
      seriesWithId('Completed Jobs'),
      seriesWithId('Failed Jobs'),
    ]),
  });
});
it('should handle empty metrics data', async () => {
  // No queue details at all: both series must still be fully zero-padded.
  workerHealth.getQueueDetails.mockResolvedValue(null);

  const { data: series } = await service.getQueueMetrics(
    MessageQueue.messagingQueue,
    QueueMetricsTimeRange.FourHours,
  );

  expect(series).toHaveLength(2);

  const [completedSeries, failedSeries] = series;

  expect(completedSeries.data).toHaveLength(240);
  expect(failedSeries.data).toHaveLength(240);
});
it('should handle metrics service errors', async () => {
  const metricsFailure = new Error('Metrics error');

  workerHealth.getQueueDetails.mockRejectedValue(metricsFailure);

  // The failure must propagate to the caller...
  const pendingMetrics = service.getQueueMetrics(
    MessageQueue.messagingQueue,
    QueueMetricsTimeRange.FourHours,
  );

  await expect(pendingMetrics).rejects.toThrow('Metrics error');

  // ...and be logged with the queue name for context.
  expect(loggerSpy).toHaveBeenCalledWith(
    'Error getting metrics for messaging-queue: Metrics error',
  );
});
describe('backfilling behavior', () => {
it('should handle partial data with correct historical backfilling', async () => {
  // 40 recent points against a four-hour window, which needs 240 points.
  const recentPointCount = 40;
  const windowPointCount = 240;
  const backfilledPointCount = windowPointCount - recentPointCount;

  const partialData = Array.from(
    { length: recentPointCount },
    (_, index) => index + 1,
  );

  workerHealth.getQueueDetails.mockResolvedValue({
    queueName: 'test-queue',
    workers: 1,
    status: 'up',
    metrics: {
      failed: 0,
      completed: 0,
      waiting: 0,
      active: 0,
      delayed: 0,
      failureRate: 0,
      completedData: partialData,
      failedData: partialData,
    },
  });

  const result = await service.getQueueMetrics(
    MessageQueue.messagingQueue,
    QueueMetricsTimeRange.FourHours,
  );

  const completedPoints = result.data[0].data;

  // The series is always padded to the full window.
  expect(completedPoints).toHaveLength(windowPointCount);

  // Oldest part of the window is zero-filled history.
  const historicalValues = completedPoints
    .slice(0, backfilledPointCount)
    .map((point) => point.y);

  expect(historicalValues.every((value) => value === 0)).toBe(true);

  // Newest part of the window carries the real data.
  const recentValues = completedPoints
    .slice(backfilledPointCount)
    .map((point) => point.y);

  expect(recentValues.every((value) => value > 0)).toBe(true);

  // Real data stays in chronological (strictly increasing) order.
  recentValues.slice(1).forEach((value, index) => {
    expect(value).toBeGreaterThan(recentValues[index]);
  });
});
it('should handle completely empty data with full backfilling', async () => {
  const noData: number[] = [];

  workerHealth.getQueueDetails.mockResolvedValue({
    queueName: 'test-queue',
    workers: 1,
    status: 'up',
    metrics: {
      failed: 0,
      completed: 0,
      waiting: 0,
      active: 0,
      delayed: 0,
      failureRate: 0,
      completedData: noData,
      failedData: noData,
    },
  });

  const result = await service.getQueueMetrics(
    MessageQueue.messagingQueue,
    QueueMetricsTimeRange.OneHour,
  );

  const completedPoints = result.data[0].data;

  // One hour maps to 60 points, every one of them backfilled with zero.
  expect(completedPoints).toHaveLength(60);

  const nonZeroPoints = completedPoints.filter((point) => point.y !== 0);

  expect(nonZeroPoints.length === 0).toBe(true);
});
});
describe('sampling behavior', () => {
it('should correctly sample data for different time ranges', async () => {
  // pointsNeeded is the raw window the service reads for each range;
  // expectedPoints is the series length after down-sampling.
  const testCases = [
    {
      timeRange: QueueMetricsTimeRange.OneHour,
      pointsNeeded: 60,
      expectedPoints: 60,
      samplingFactor: 1,
    },
    {
      timeRange: QueueMetricsTimeRange.FourHours,
      pointsNeeded: 240,
      expectedPoints: 240,
      samplingFactor: 1,
    },
    {
      timeRange: QueueMetricsTimeRange.OneDay,
      pointsNeeded: 1440,
      expectedPoints: 240,
      samplingFactor: 6,
    },
  ];

  for (const testCase of testCases) {
    // Strictly positive, strictly increasing values so real buckets can
    // never be confused with zero backfill.
    const testData = Array.from(
      { length: testCase.expectedPoints * 2 },
      (_, index) => index + 1,
    );

    workerHealth.getQueueDetails.mockResolvedValue({
      queueName: 'test-queue',
      workers: 1,
      status: 'up',
      metrics: {
        failed: 0,
        completed: 0,
        waiting: 0,
        active: 0,
        delayed: 0,
        failureRate: 0,
        completedData: testData,
        failedData: testData,
      },
    });

    const result = await service.getQueueMetrics(
      MessageQueue.messagingQueue,
      testCase.timeRange,
    );

    const sampledValues = result.data[0].data.map((point) => point.y);

    expect(sampledValues).toHaveLength(testCase.expectedPoints);

    // Expected buckets: the trailing window of the input, chunked by the
    // sampling factor, each chunk reduced to its maximum.
    const relevantData = testData.slice(-testCase.pointsNeeded);
    const expectedSampled: number[] = [];

    for (
      let start = 0;
      start < relevantData.length;
      start += testCase.samplingFactor
    ) {
      expectedSampled.push(
        Math.max(
          ...relevantData.slice(start, start + testCase.samplingFactor),
        ),
      );
    }

    // Whatever the real data does not cover is zero backfill at the front,
    // so the real buckets must be compared at that offset (not from index 0).
    const backfillCount = testCase.expectedPoints - expectedSampled.length;

    expect(backfillCount).toBeGreaterThanOrEqual(0);
    expect(
      sampledValues.slice(0, backfillCount).every((value) => value === 0),
    ).toBe(true);
    expect(sampledValues.slice(backfillCount)).toEqual(expectedSampled);
  }
});
});
});
describe('getPointsConfiguration', () => {
  // Every range is charted with the same number of visual points.
  const targetVisualizationPoints = 240;

  // [time range, raw points needed, sampling factor]
  const expectations: Array<[QueueMetricsTimeRange, number, number]> = [
    [QueueMetricsTimeRange.OneHour, 60, 1],
    [QueueMetricsTimeRange.FourHours, 240, 1],
    [QueueMetricsTimeRange.TwelveHours, 720, 3],
    [QueueMetricsTimeRange.OneDay, 1440, 6],
    [QueueMetricsTimeRange.SevenDays, 10080, 42],
  ];

  for (const [timeRange, pointsNeeded, samplingFactor] of expectations) {
    it(`should return correct parameters for ${timeRange}`, () => {
      const result = service['getPointsConfiguration'](timeRange as any);

      expect(result).toEqual({
        pointsNeeded,
        samplingFactor,
        targetVisualizationPoints,
      });
    });
  }
});
});

View File

@ -1,23 +1,33 @@
import { Injectable } from '@nestjs/common';
import { Injectable, Logger } from '@nestjs/common';
import { HealthIndicatorResult, HealthIndicatorStatus } from '@nestjs/terminus';
import { Queue } from 'bullmq';
import { HEALTH_INDICATORS } from 'src/engine/core-modules/admin-panel/constants/health-indicators.constants';
import { AdminPanelHealthServiceData } from 'src/engine/core-modules/admin-panel/dtos/admin-panel-health-service-data.dto';
import { QueueMetricsData } from 'src/engine/core-modules/admin-panel/dtos/queue-metrics-data.dto';
import { SystemHealth } from 'src/engine/core-modules/admin-panel/dtos/system-health.dto';
import { AdminPanelHealthServiceStatus } from 'src/engine/core-modules/admin-panel/enums/admin-panel-health-service-status.enum';
import { QueueMetricsTimeRange } from 'src/engine/core-modules/admin-panel/enums/queue-metrics-time-range.enum';
import { HealthIndicatorId } from 'src/engine/core-modules/health/enums/health-indicator-id.enum';
import { ConnectedAccountHealth } from 'src/engine/core-modules/health/indicators/connected-account.health';
import { DatabaseHealthIndicator } from 'src/engine/core-modules/health/indicators/database.health';
import { RedisHealthIndicator } from 'src/engine/core-modules/health/indicators/redis.health';
import { WorkerHealthIndicator } from 'src/engine/core-modules/health/indicators/worker.health';
import { WorkerQueueHealth } from 'src/engine/core-modules/health/types/worker-queue-health.type';
import { MessageQueue } from 'src/engine/core-modules/message-queue/message-queue.constants';
import { RedisClientService } from 'src/engine/core-modules/redis-client/redis-client.service';
@Injectable()
export class AdminPanelHealthService {
private readonly logger = new Logger(AdminPanelHealthService.name);
// Collaborators are supplied through constructor injection: one indicator per
// health area (database, Redis, worker, connected accounts) plus the Redis
// client service, which getQueueMetrics uses to obtain a connection.
constructor(
private readonly databaseHealth: DatabaseHealthIndicator,
private readonly redisHealth: RedisHealthIndicator,
private readonly workerHealth: WorkerHealthIndicator,
private readonly connectedAccountHealth: ConnectedAccountHealth,
private readonly redisClient: RedisClientService,
) {}
private readonly healthIndicators = {
@ -93,8 +103,8 @@ export class AdminPanelHealthService {
return {
...indicatorStatus,
queues: (indicatorStatus?.queues ?? []).map((queue) => ({
...queue,
id: `${indicatorId}-${queue.queueName}`,
queueName: queue.queueName,
status:
queue.workers > 0
? AdminPanelHealthServiceStatus.OPERATIONAL
@ -144,4 +154,166 @@ export class AdminPanelHealthService {
],
};
}
/**
 * Builds the completed/failed job series for one queue over a time range.
 *
 * The range is converted into a raw point count and a sampling factor, the
 * raw series are read through the worker health indicator, and both series
 * are padded/down-sampled and shaped for the graph.
 *
 * @param queueName - Queue whose metrics are requested.
 * @param timeRange - Window to chart; defaults to one day.
 * @returns Graph-ready series plus the worker count and metric details.
 * @throws Rethrows any failure unchanged after logging it.
 */
async getQueueMetrics(
  queueName: MessageQueue,
  timeRange: QueueMetricsTimeRange = QueueMetricsTimeRange.OneDay,
): Promise<QueueMetricsData> {
  const redis = this.redisClient.getClient();
  // NOTE(review): this queue handle is only opened here and closed in
  // `finally`; nothing in this method reads from it — confirm it is needed.
  const queue = new Queue(queueName, { connection: redis });

  try {
    const { pointsNeeded, samplingFactor } =
      this.getPointsConfiguration(timeRange);

    const queueDetails = await this.workerHealth.getQueueDetails(queueName, {
      pointsNeeded,
    });

    // Missing details or series are tolerated: extractMetricsData turns an
    // absent series into an all-zero one.
    const completedMetrics = this.extractMetricsData(
      queueDetails?.metrics?.completedData,
      pointsNeeded,
      samplingFactor,
    );
    const failedMetrics = this.extractMetricsData(
      queueDetails?.metrics?.failedData,
      pointsNeeded,
      samplingFactor,
    );

    return this.transformMetricsForGraph(
      completedMetrics,
      failedMetrics,
      timeRange,
      queueName,
      queueDetails,
    );
  } catch (error) {
    // A thrown value is not guaranteed to be an Error; reading `.message`
    // blindly would log "undefined" for anything else.
    const reason = error instanceof Error ? error.message : String(error);

    this.logger.error(`Error getting metrics for ${queueName}: ${reason}`);
    throw error;
  } finally {
    await queue.close();
  }
}
/**
 * Maps a time range to the number of raw points to read and the factor by
 * which they are down-sampled so the chart stays near 240 points.
 *
 * @param timeRange - Requested window; unknown values fall back to one day.
 * @returns Raw point count, sampling factor and the visualization target.
 */
private getPointsConfiguration(timeRange: QueueMetricsTimeRange): {
  pointsNeeded: number;
  samplingFactor: number;
  targetVisualizationPoints: number;
} {
  const targetVisualizationPoints = 240;
  const pointsPerHour = 60;
  const oneDayOfPoints = 24 * pointsPerHour; // 1440 points, also the fallback

  const pointsByRange = new Map<QueueMetricsTimeRange, number>([
    [QueueMetricsTimeRange.OneHour, pointsPerHour], // 60 points
    [QueueMetricsTimeRange.FourHours, 4 * pointsPerHour], // 240 points
    [QueueMetricsTimeRange.TwelveHours, 12 * pointsPerHour], // 720 points
    [QueueMetricsTimeRange.OneDay, oneDayOfPoints],
    [QueueMetricsTimeRange.SevenDays, 7 * oneDayOfPoints], // 10080 points
  ]);

  const pointsNeeded = pointsByRange.get(timeRange) ?? oneDayOfPoints;

  // Only windows larger than the target need to be collapsed.
  let samplingFactor = 1;

  if (pointsNeeded > targetVisualizationPoints) {
    samplingFactor = Math.ceil(pointsNeeded / targetVisualizationPoints);
  }

  return { pointsNeeded, samplingFactor, targetVisualizationPoints };
}
/**
 * Normalizes a raw metrics series to a fixed-length, down-sampled series.
 *
 * Only the trailing `pointsNeeded` values are kept. They are grouped into
 * consecutive chunks of `samplingFactor` values, each reduced to its maximum,
 * and the result is left-padded with zeros up to the target length.
 *
 * @param metrics - Raw series, or undefined when none is available.
 * @param pointsNeeded - Number of trailing raw points to consider.
 * @param samplingFactor - Raw points collapsed into one output point.
 * @returns Series of `ceil(pointsNeeded / samplingFactor)` values at most.
 * @throws Rethrows any processing error after logging it.
 */
private extractMetricsData(
  metrics: number[] | undefined,
  pointsNeeded: number,
  samplingFactor = 1,
): number[] {
  // No usable series: the whole window is zero.
  if (!Array.isArray(metrics)) {
    const emptyLength = Math.ceil(pointsNeeded / samplingFactor);

    return Array(emptyLength).fill(0);
  }

  try {
    const targetLength = Math.ceil(pointsNeeded / samplingFactor);
    const window = metrics.slice(-pointsNeeded);
    const sampledLength = Math.ceil(window.length / samplingFactor);
    const paddingLength = Math.max(0, targetLength - sampledLength);

    // Zero history first, real buckets after it.
    const series: number[] = [];

    series.push(...Array(paddingLength).fill(0));

    let cursor = 0;

    while (cursor < window.length) {
      const bucket = window.slice(cursor, cursor + samplingFactor);

      series.push(Math.max(...bucket));
      cursor += samplingFactor;
    }

    return series.slice(0, targetLength);
  } catch (error) {
    this.logger.error(`Error extracting metrics data: ${error.message}`);
    throw error;
  }
}
/**
 * Shapes the two normalized series into the response returned to the client.
 *
 * Each value becomes an `{ x, y }` point where `x` is its position in the
 * series and `y` is the count. Missing queue details yield `details: null`
 * and `workers: 0`.
 *
 * @throws Rethrows any error after logging it.
 */
private transformMetricsForGraph(
  completedMetrics: number[],
  failedMetrics: number[],
  timeRange: QueueMetricsTimeRange,
  queueName: MessageQueue,
  queueDetails: WorkerQueueHealth | null,
): QueueMetricsData {
  const toPoints = (counts: number[]) =>
    counts.map((count, index) => ({ x: index, y: count }));

  try {
    const details = queueDetails?.metrics ?? null;
    const workers = queueDetails?.workers ?? 0;

    const completedSeries = {
      id: 'Completed Jobs',
      data: toPoints(completedMetrics),
    };
    const failedSeries = {
      id: 'Failed Jobs',
      data: toPoints(failedMetrics),
    };

    return {
      queueName,
      timeRange,
      details,
      workers,
      data: [completedSeries, failedSeries],
    };
  } catch (error) {
    this.logger.error(
      `Error transforming metrics for graph: ${error.message}`,
    );
    throw error;
  }
}
}

View File

@ -10,20 +10,24 @@ import { SystemHealth } from 'src/engine/core-modules/admin-panel/dtos/system-he
import { UpdateWorkspaceFeatureFlagInput } from 'src/engine/core-modules/admin-panel/dtos/update-workspace-feature-flag.input';
import { UserLookup } from 'src/engine/core-modules/admin-panel/dtos/user-lookup.entity';
import { UserLookupInput } from 'src/engine/core-modules/admin-panel/dtos/user-lookup.input';
import { QueueMetricsTimeRange } from 'src/engine/core-modules/admin-panel/enums/queue-metrics-time-range.enum';
import { AuthGraphqlApiExceptionFilter } from 'src/engine/core-modules/auth/filters/auth-graphql-api-exception.filter';
import { HealthIndicatorId } from 'src/engine/core-modules/health/enums/health-indicator-id.enum';
import { WorkerHealthIndicator } from 'src/engine/core-modules/health/indicators/worker.health';
import { MessageQueue } from 'src/engine/core-modules/message-queue/message-queue.constants';
import { ImpersonateGuard } from 'src/engine/guards/impersonate-guard';
import { UserAuthGuard } from 'src/engine/guards/user-auth.guard';
import { WorkspaceAuthGuard } from 'src/engine/guards/workspace-auth.guard';
import { AdminPanelHealthServiceData } from './dtos/admin-panel-health-service-data.dto';
import { QueueMetricsData } from './dtos/queue-metrics-data.dto';
@Resolver()
@UseFilters(AuthGraphqlApiExceptionFilter)
export class AdminPanelResolver {
// Injected collaborators. adminPanelHealthService backs the health-status and
// queue-metrics queries of this resolver.
// NOTE(review): workerHealthIndicator is not referenced by the resolver code
// visible here — confirm it is still needed.
constructor(
private adminService: AdminPanelService,
private adminPanelHealthService: AdminPanelHealthService,
private workerHealthIndicator: WorkerHealthIndicator,
) {}
@UseGuards(WorkspaceAuthGuard, UserAuthGuard, ImpersonateGuard)
@ -68,6 +72,7 @@ export class AdminPanelResolver {
return this.adminPanelHealthService.getSystemHealthStatus();
}
@UseGuards(WorkspaceAuthGuard, UserAuthGuard, ImpersonateGuard)
@Query(() => AdminPanelHealthServiceData)
async getIndicatorHealthStatus(
@Args('indicatorId', {
@ -77,4 +82,22 @@ export class AdminPanelResolver {
): Promise<AdminPanelHealthServiceData> {
return this.adminPanelHealthService.getIndicatorHealthStatus(indicatorId);
}
/**
 * Returns graph-ready completed/failed job metrics for one worker queue.
 *
 * `timeRange` is nullable in the schema, so a client may send an explicit
 * `null`, which the schema-level default does not replace. It is coalesced
 * to one day here because `QueueMetricsData.timeRange` is non-nullable, and
 * the TypeScript default is kept identical to the schema default.
 *
 * @param queueName - Name of the queue to inspect.
 * @param timeRange - Window to chart; one day when omitted or null.
 */
@UseGuards(WorkspaceAuthGuard, UserAuthGuard, ImpersonateGuard)
@Query(() => QueueMetricsData)
async getQueueMetrics(
  @Args('queueName', { type: () => String })
  queueName: string,
  @Args('timeRange', {
    nullable: true,
    defaultValue: QueueMetricsTimeRange.OneDay,
    type: () => QueueMetricsTimeRange,
  })
  timeRange: QueueMetricsTimeRange | null = QueueMetricsTimeRange.OneDay,
): Promise<QueueMetricsData> {
  const effectiveTimeRange = timeRange ?? QueueMetricsTimeRange.OneDay;

  return await this.adminPanelHealthService.getQueueMetrics(
    // NOTE(review): queueName is a free-form client string cast to
    // MessageQueue without validation — consider checking it against the
    // known queue names.
    queueName as MessageQueue,
    effectiveTimeRange,
  );
}
}

View File

@ -1,13 +1,15 @@
import { Field, ObjectType } from '@nestjs/graphql';
import { AdminPanelHealthServiceStatus } from 'src/engine/core-modules/admin-panel/enums/admin-panel-health-service-status.enum';
import { WorkerQueueHealth } from 'src/engine/core-modules/health/types/worker-queue-health.type';
// GraphQL object type describing one worker queue in the admin panel health
// view. It exposes an identifier, the queue name and a status.
@ObjectType()
export class AdminPanelWorkerQueueHealth extends WorkerQueueHealth {
export class AdminPanelWorkerQueueHealth {
// Identifier of this queue entry.
@Field(() => String)
id: string;
// Name of the queue.
@Field(() => String)
queueName: string;
// Health status of the queue, as an AdminPanelHealthServiceStatus value.
@Field(() => AdminPanelHealthServiceStatus)
status: AdminPanelHealthServiceStatus;
}

View File

@ -0,0 +1,10 @@
import { Field, ObjectType } from '@nestjs/graphql';
/** GraphQL object type for a single numeric point of a queue metrics series. */
@ObjectType()
export class QueueMetricsDataPoint {
// Horizontal coordinate of the point.
@Field(() => Number)
x: number;
// Vertical coordinate of the point.
@Field(() => Number)
y: number;
}

View File

@ -0,0 +1,23 @@
import { Field, ObjectType } from '@nestjs/graphql';
import { QueueMetricsSeries } from 'src/engine/core-modules/admin-panel/dtos/queue-metrics-series.dto';
import { QueueMetricsTimeRange } from 'src/engine/core-modules/admin-panel/enums/queue-metrics-time-range.enum';
import { WorkerQueueMetrics } from 'src/engine/core-modules/health/types/worker-queue-metrics.type';
/**
 * GraphQL object type carrying the metrics of one queue: its name, worker
 * count, the time range covered, optional metric details and the series.
 */
@ObjectType()
export class QueueMetricsData {
// Name of the queue these metrics belong to.
@Field(() => String)
queueName: string;
// Number of workers for the queue.
@Field(() => Number)
workers: number;
// Time range covered by the series; non-nullable in the schema.
@Field(() => QueueMetricsTimeRange)
timeRange: QueueMetricsTimeRange;
// Metric details for the queue; null when none are available.
@Field(() => WorkerQueueMetrics, { nullable: true })
details: WorkerQueueMetrics | null;
// Series of data points.
@Field(() => [QueueMetricsSeries])
data: QueueMetricsSeries[];
}

View File

@ -0,0 +1,12 @@
import { Field, ObjectType } from '@nestjs/graphql';
import { QueueMetricsDataPoint } from 'src/engine/core-modules/admin-panel/dtos/queue-metrics-data-point.dto';
/** GraphQL object type for one named series of queue metrics data points. */
@ObjectType()
export class QueueMetricsSeries {
// Identifier of the series.
@Field()
id: string;
// Points making up the series.
@Field(() => [QueueMetricsDataPoint])
data: QueueMetricsDataPoint[];
}

View File

@ -0,0 +1,13 @@
import { registerEnumType } from '@nestjs/graphql';
/** Time windows that can be requested for queue metrics, as short string codes. */
export enum QueueMetricsTimeRange {
SevenDays = '7D',
OneDay = '1D',
TwelveHours = '12H',
FourHours = '4H',
OneHour = '1H',
}
// Register the enum with the GraphQL layer under the name
// 'QueueMetricsTimeRange' so it can be referenced from @Field/@Args types.
registerEnumType(QueueMetricsTimeRange, {
name: 'QueueMetricsTimeRange',
});