Commit 22cd5747 authored by Dmytro Bogatov's avatar Dmytro Bogatov 💕
Browse files

Add discrepancy type low health.

parent 802a0ca3
Pipeline #534 passed with stages
in 6 minutes and 20 seconds
......@@ -11,6 +11,12 @@ type JsonHealthDataPoint = {
Data: HealthReportDataPoint[];
}
/**
* Model for individual metric health (status)
*
* @export
* @class HealthReportDataPoint
*/
export class HealthReportDataPoint {
public label: string;
public source: string;
......@@ -34,6 +40,12 @@ export class HealthDataPoint extends DataPoint {
*/
public health: number;
/**
* The detailed data used to compute overall health
*
* @type {HealthReportDataPoint[]}
* @memberof HealthDataPoint
*/
public data: HealthReportDataPoint[];
......
......@@ -20,6 +20,9 @@ ServiceManager:
Interval: 10
Gaps:
MaxDifference: 5
Health:
Threshold: 90
MaxFailures: 3
NotificationService:
Verbosity: "less"
Logging:
......
......@@ -90,6 +90,9 @@ ServiceManager:
Load:
Threshold: 90
MaxFailures: 5
Health:
Threshold: 90
MaxFailures: 3
NotificationService:
Enabled: true
Interval: 30
......
......@@ -72,6 +72,15 @@ namespace StatusMonitor.Daemons.Services
/// <param name="ago">Time ago from now which defines a timeframe of data to analyze</param>
/// <returns>A list of found discrepancies</returns>
Task<List<Discrepancy>> FindPingFailuresAsync(Metric metric, TimeSpan ago);
/// <summary>
/// Returns a list of discrepancies of type DiscrepancyType.LowHealth
/// found in the data for the given metric within the given timeframe.
/// </summary>
/// <param name="metric">Metric for which to find discrepancies. Must be of type Metrics.Health.</param>
/// <param name="ago">Time ago from now which defines a timeframe of data to analyze</param>
/// <returns>A list of found discrepancies</returns>
/// <exception cref="ArgumentException">
/// Thrown by the DiscrepancyService implementation when the metric is not of type Metrics.Health.
/// </exception>
Task<List<Discrepancy>> FindLowHealthsAsync(Metric metric, TimeSpan ago);
}
public class DiscrepancyService : IDiscrepancyService
......@@ -292,6 +301,26 @@ namespace StatusMonitor.Daemons.Services
return result;
}
/// <summary>
/// Loads the health reports stored for the given metric whose timestamp falls within
/// the last <paramref name="ago"/> and searches them for low health discrepancies.
/// </summary>
/// <param name="metric">Metric to analyze. Its type has to be Metrics.Health.</param>
/// <param name="ago">Length of the timeframe, counted back from the current UTC time</param>
/// <returns>The discrepancies reported by FindLowHealthInDataPoints for the loaded reports</returns>
/// <exception cref="ArgumentException">Thrown when the metric is not of type Metrics.Health</exception>
public async Task<List<Discrepancy>> FindLowHealthsAsync(Metric metric, TimeSpan ago)
{
    var isHealthMetric = metric.Type == Metrics.Health.AsInt();
    if (!isHealthMetric)
    {
        throw new ArgumentException($"Metric for FindLowHealthsAsync has to be of type {Metrics.Health}");
    }

    // The metric is attached to the context before it is used in the query below
    _context.Metrics.Attach(metric);

    var windowStart = DateTime.UtcNow - ago;

    var reports = await _context
        .HealthReports
        .Where(report => report.Metric == metric && report.Timestamp >= windowStart)
        .ToListAsync();

    return FindLowHealthInDataPoints(reports, metric);
}
/// <summary>
/// Traverses the lists of ping datapoints looking for places where ping failed a number of times defined
/// in setting.
......@@ -420,6 +449,72 @@ namespace StatusMonitor.Daemons.Services
.ToList();
}
/// <summary>
/// Traverses the list of health reports looking for places where the health is less than
/// the value defined in config ServiceManager:DiscrepancyService:Health:Threshold for more than
/// ServiceManager:DiscrepancyService:Health:MaxFailures consecutive datapoints.
/// Low health reports at the very beginning of the (time ordered) data are skipped.
/// </summary>
/// <param name="datapoints">List of health reports to traverse</param>
/// <param name="metric">Metric which will appear in a resulting discrepancy object</param>
/// <returns>A list of discrepancies of type DiscrepancyType.LowHealth found</returns>
internal List<Discrepancy> FindLowHealthInDataPoints(List<HealthReport> datapoints, Metric metric)
{
    if (!datapoints.Any())
    {
        return new List<Discrepancy>();
    }

    // Parse the threshold once instead of once per datapoint per enumeration
    var threshold = Convert.ToInt32(_config["ServiceManager:DiscrepancyService:Health:Threshold"]);

    // Materialize the projection: it is enumerated twice below (Any and OrderBy),
    // so a deferred query would compute the health of every report twice
    var healths = datapoints
        .Select(d => new
        {
            NormalHealth = d.Health >= threshold,
            Timestamp = d.Timestamp
        })
        .ToList();

    if (!healths.Any(l => l.NormalHealth))
    {
        // Means that health is low for the whole timeframe
        // Discrepancy has been noticed already
        return new List<Discrepancy>();
    }

    // Parse the limit once instead of once per run of low health
    var maxFailures = Convert.ToInt32(_config["ServiceManager:DiscrepancyService:Health:MaxFailures"]);

    return healths
        .OrderBy(p => p.Timestamp)
        // Leading low health reports are not counted as a new discrepancy
        .SkipWhile(p => !p.NormalHealth)
        // Collapse the ordered reports into runs of consecutive equal states
        .Aggregate(
            new Stack<BoolIntDateTuple>(),
            (rest, self) =>
            {
                if (rest.Count == 0 || rest.Peek().StateGood != self.NormalHealth)
                {
                    // State changed (or first element): start a new run
                    rest.Push(new BoolIntDateTuple
                    {
                        StateGood = self.NormalHealth,
                        DateFirstOffense = self.Timestamp
                    });
                }
                else
                {
                    // Same state as the current run: extend it
                    rest.Peek().Count++;
                }

                return rest;
            }
        )
        // Keep only the runs of low health which lasted longer than allowed
        .Where(t => !t.StateGood && t.Count > maxFailures)
        .Select(x => new Discrepancy
        {
            DateFirstOffense = x.DateFirstOffense,
            Type = DiscrepancyType.LowHealth,
            MetricType = (Metrics)metric.Type,
            MetricSource = metric.Source
        })
        .ToList();
}
public async Task<IEnumerable<Discrepancy>> FindResolvedDiscrepanciesAsync(IEnumerable<Discrepancy> discrepancies)
{
var resolvedDiscrepancies = new List<Discrepancy>();
......@@ -464,6 +559,21 @@ namespace StatusMonitor.Daemons.Services
resolvedDiscrepancies.Add(discrepancy);
}
break;
case DiscrepancyType.LowHealth:
if (
await _context
.HealthReports
.Where(dp =>
dp.Metric.Source == discrepancy.MetricSource &&
dp.Metric.Type == discrepancy.MetricType.AsInt() &&
dp.Timestamp > discrepancy.DateFirstOffense
)
.AnyAsync(dp => dp.Health >= Convert.ToInt32(_config["ServiceManager:DiscrepancyService:Health:Threshold"]))
)
{
resolvedDiscrepancies.Add(discrepancy);
}
break;
case DiscrepancyType.PingFailedNTimes:
if (
await _context
......
......@@ -414,6 +414,11 @@ namespace StatusMonitor.Daemons.Services
.Where(mt => mt.Type == Metrics.Ping.AsInt())
.ToListAsync();
var healthMetrics = await context
.Metrics
.Where(mt => mt.Type == Metrics.Health.AsInt())
.ToListAsync();
// Find GapInData discrepancies
var gapTasks = cpuMetrics
.Select(metric =>
......@@ -458,6 +463,28 @@ namespace StatusMonitor.Daemons.Services
})
);
// Find LowHealth discrepancies
var lowHealthTasks = healthMetrics
.Select(metric =>
Task.Run(async () =>
{
using (var scope = _serviceProvider.CreateScope())
{
return await scope
.ServiceProvider
.GetRequiredService<IDiscrepancyService>()
.FindLowHealthsAsync(
metric,
new TimeSpan(
0,
0,
Convert.ToInt32(_config["ServiceManager:DiscrepancyService:DataTimeframe"])
)
);
}
})
);
// Find PingFailedNTimes discrepancies
var pingTasks = pingMetrics
.Select(metric =>
......@@ -485,7 +512,8 @@ namespace StatusMonitor.Daemons.Services
new Task<List<Discrepancy>>[] { }
.Concat(gapTasks).ToArray()
.Concat(pingTasks).ToArray()
.Concat(highLoadTasks).ToArray())
.Concat(highLoadTasks).ToArray()
.Concat(lowHealthTasks).ToArray())
;
using (var scope = _serviceProvider.CreateScope())
......
......@@ -11,8 +11,8 @@ namespace StatusMonitor.Shared.Models.Entities
{
[Key]
public int Id { get; set; }
public DateTime Timestamp { get; set; } = DateTime.UtcNow;
public Metric Metric { get; set; }
public virtual DateTime Timestamp { get; set; } = DateTime.UtcNow;
public virtual Metric Metric { get; set; }
/// <summary>
/// Generates an object which contains the fields of DataPoint visible to public
......
......@@ -33,6 +33,8 @@ namespace StatusMonitor.Shared.Models.Entities
return $"Gap in data from {MetricSource} has been detected. The gap starts on {DateFirstOffense.ToStringUsingTimeZone(timeZoneId)}.";
case DiscrepancyType.HighLoad:
return $"{MetricSource} reported high load starting from {DateFirstOffense.ToStringUsingTimeZone(timeZoneId)}.";
case DiscrepancyType.LowHealth:
return $"{MetricSource} reported low health starting from {DateFirstOffense.ToStringUsingTimeZone(timeZoneId)}.";
case DiscrepancyType.PingFailedNTimes:
return $"Requests to {MetricSource} failed too many consecutive times. First failure occurred on {DateFirstOffense.ToStringUsingTimeZone(timeZoneId)}.";
default:
......@@ -45,6 +47,6 @@ namespace StatusMonitor.Shared.Models.Entities
public enum DiscrepancyType
{
GapInData, PingFailedNTimes, HighLoad
GapInData, PingFailedNTimes, HighLoad, LowHealth
}
}
......@@ -55,6 +55,7 @@ namespace StatusMonitor.Web.TagHelpers
{ "Admin password", _config["Secrets:AdminPassword"] },
{ "Database connection string", _config["Secrets:ConnectionString"] },
{ "ReCaptcha", GenerateReCaptchaInfo() },
{ "Google Analytics Tracking Id", _config["Secrets:GoogleAnalytics:TrackingId"] },
{ "Email", GenerateEmailInfo() },
{ "Slack", GenerateSlackInfo() }
})}
......@@ -91,6 +92,7 @@ namespace StatusMonitor.Web.TagHelpers
{ "Cache service", GenerateCacheServiceInfo() },
{ "Clean service", GenerateCleanServiceInfo() },
{ "Ping service", GeneratePingServiceInfo() },
{ "Health service", GenerateHealthServiceInfo() },
{ "Demo service", GenerateDemoServiceInfo() },
{ "Discrepancy service", GenerateDiscrepancyServiceInfo() },
{ "Notification service", GenerateNotificationServiceInfo() }
......@@ -156,7 +158,7 @@ namespace StatusMonitor.Web.TagHelpers
/// <summary>
/// Translates a boolean config value into a human readable state.
/// </summary>
/// <param name="setting">Config key whose value is converted to a boolean</param>
/// <returns>"Enabled" if the value of the setting converts to true, "Disabled" otherwise</returns>
private string GenerateEnabledDisabled(string setting)
{
    return Convert.ToBoolean(_config[setting]) ? "Enabled" : "Disabled";
}
private string GenerateEnabledDisabledInfo(string service, string setting, string info)
......@@ -164,8 +166,8 @@ namespace StatusMonitor.Web.TagHelpers
return $@"
{service} is {GenerateEnabledDisabled(setting).ToLower()}.
{(
Convert.ToBoolean(_config[setting]) ?
info :
Convert.ToBoolean(_config[setting]) ?
info :
""
)}";
}
......@@ -232,6 +234,15 @@ namespace StatusMonitor.Web.TagHelpers
);
}
/// <summary>
/// Builds the info text for the health service: its enabled/disabled state
/// (config ServiceManager:HealthService:Enabled) together with the configured interval.
/// </summary>
/// <returns>Text produced by GenerateEnabledDisabledInfo for the health service</returns>
private string GenerateHealthServiceInfo()
{
    var intervalInfo = $"The interval is {_config["ServiceManager:HealthService:Interval"]} seconds.";

    return GenerateEnabledDisabledInfo("Health service", "ServiceManager:HealthService:Enabled", intervalInfo);
}
private string GenerateCleanServiceInfo()
{
return GenerateEnabledDisabledInfo(
......@@ -264,7 +275,8 @@ namespace StatusMonitor.Web.TagHelpers
The interval is {_config["ServiceManager:DiscrepancyService:Interval"]}.
Service will analyze {_config["ServiceManager:DiscrepancyService:DataTimeframe"]} seconds of data per run.
'Gap in data' discrepancy will be reported if time difference between any two consecutive datapoints is more than 1.5x of {_config["ServiceManager:DiscrepancyService:Gaps:MaxDifference"]} seconds.
'High load' discrepancy will be reported if load of {_config["ServiceManager:DiscrepancyService:Load:Threshold"]}+ occurs for more than {_config["ServiceManager:DiscrepancyService:Load:MaxFailures"]} consecutive recordings."
'High load' discrepancy will be reported if load of {_config["ServiceManager:DiscrepancyService:Load:Threshold"]}+ occurs for more than {_config["ServiceManager:DiscrepancyService:Load:MaxFailures"]} consecutive recordings.
'Low health' discrepancy will be reported if load of less than {_config["ServiceManager:DiscrepancyService:Health:Threshold"]} occurs for more than {_config["ServiceManager:DiscrepancyService:Health:MaxFailures"]} consecutive recordings."
);
}
......@@ -279,15 +291,18 @@ namespace StatusMonitor.Web.TagHelpers
Enum
.GetValues(typeof(NotificationSeverity))
.Cast<object>()
.Select(obj => new {
.Select(obj => new
{
Severity = obj.ToString(),
Frequency = _config[$"ServiceManager:NotificationService:Frequencies:{obj.ToString()}"]
Frequency = _config[$"ServiceManager:NotificationService:Frequencies:{obj.ToString()}"]
})
.Select(obj => $"Recipient will get notifications of severity {obj.Severity} no more than once in {obj.Frequency} seconds.")
.Aggregate(
(self, next) => $"{next}{Environment.NewLine}{self}"
)
}
Timezone is set to '{_config[$"ServiceManager:NotificationService:TimeZone"]}'.
Verbosity is set to '{_config[$"ServiceManager:NotificationService:Verbosity"]}'.
"
);
}
......
......@@ -119,6 +119,8 @@ namespace StatusMonitor.Tests.UnitTests
TestIntKey("ServiceManager:DiscrepancyService:Gaps:MaxDifference");
TestIntKey("ServiceManager:DiscrepancyService:Load:Threshold");
TestIntKey("ServiceManager:DiscrepancyService:Load:MaxFailures");
TestIntKey("ServiceManager:DiscrepancyService:Health:Threshold");
TestIntKey("ServiceManager:DiscrepancyService:Health:MaxFailures");
TestIntKey($"ServiceManager:NotificationService:Frequencies:{NotificationSeverity.Low.ToString()}");
TestIntKey($"ServiceManager:NotificationService:Frequencies:{NotificationSeverity.Medium.ToString()}");
......
......@@ -148,6 +148,7 @@ namespace StatusMonitor.Tests.UnitTests.Services
[Theory]
[InlineData(Metrics.CpuLoad)]
[InlineData(Metrics.Ping)]
[InlineData(Metrics.Health)]
public async Task LabelForTooFewPoints(Metrics type)
{
// Arrange
......@@ -174,6 +175,13 @@ namespace StatusMonitor.Tests.UnitTests.Services
new NumericDataPoint { Value = 43, Metric = metric }
);
break;
case Metrics.Health:
await context.HealthReports.AddRangeAsync(
DiscrepancyServiceTest.GenerateHealthReport(true, metric),
DiscrepancyServiceTest.GenerateHealthReport(true, metric),
DiscrepancyServiceTest.GenerateHealthReport(true, metric)
);
break;
case Metrics.Ping:
await context.PingSettings.AddAsync(
new PingSetting
......@@ -233,6 +241,7 @@ namespace StatusMonitor.Tests.UnitTests.Services
[Theory]
[InlineData(Metrics.CpuLoad)]
[InlineData(Metrics.Ping)]
[InlineData(Metrics.Health)]
public async Task LabelNormal(Metrics type)
{
// Arrange
......@@ -261,6 +270,15 @@ namespace StatusMonitor.Tests.UnitTests.Services
new NumericDataPoint { Value = 34, Metric = metric }
);
break;
case Metrics.Health:
await context.HealthReports.AddRangeAsync(
DiscrepancyServiceTest.GenerateHealthReport(true, metric),
DiscrepancyServiceTest.GenerateHealthReport(true, metric),
DiscrepancyServiceTest.GenerateHealthReport(true, metric),
DiscrepancyServiceTest.GenerateHealthReport(true, metric),
DiscrepancyServiceTest.GenerateHealthReport(true, metric)
);
break;
case Metrics.Ping:
await context.PingSettings.AddAsync(
new PingSetting
......@@ -351,6 +369,7 @@ namespace StatusMonitor.Tests.UnitTests.Services
[Theory]
[InlineData(Metrics.CpuLoad)]
[InlineData(Metrics.Ping)]
[InlineData(Metrics.Health)]
public async Task LabelWarning(Metrics type)
{
// Arrange
......@@ -379,6 +398,15 @@ namespace StatusMonitor.Tests.UnitTests.Services
new NumericDataPoint { Value = 90, Metric = metric }
);
break;
case Metrics.Health:
await context.HealthReports.AddRangeAsync(
DiscrepancyServiceTest.GenerateHealthReport(true, metric),
DiscrepancyServiceTest.GenerateHealthReport(true, metric),
DiscrepancyServiceTest.GenerateHealthReport(true, metric),
DiscrepancyServiceTest.GenerateHealthReport(false, metric),
DiscrepancyServiceTest.GenerateHealthReport(false, metric)
);
break;
case Metrics.Ping:
await context.PingSettings.AddAsync(
new PingSetting
......@@ -467,6 +495,7 @@ namespace StatusMonitor.Tests.UnitTests.Services
[Theory]
[InlineData(Metrics.CpuLoad)]
[InlineData(Metrics.Ping)]
[InlineData(Metrics.Health)]
public async Task LabelCritical(Metrics type)
{
// Arrange
......@@ -495,6 +524,15 @@ namespace StatusMonitor.Tests.UnitTests.Services
new NumericDataPoint { Value = 100, Metric = metric }
);
break;
case Metrics.Health:
await context.HealthReports.AddRangeAsync(
DiscrepancyServiceTest.GenerateHealthReport(true, metric),
DiscrepancyServiceTest.GenerateHealthReport(false, metric),
DiscrepancyServiceTest.GenerateHealthReport(false, metric),
DiscrepancyServiceTest.GenerateHealthReport(false, metric),
DiscrepancyServiceTest.GenerateHealthReport(false, metric)
);
break;
case Metrics.Ping:
await context.PingSettings.AddAsync(
new PingSetting
......
......@@ -146,6 +146,64 @@ namespace StatusMonitor.Tests.UnitTests.Services
}
}
/// <summary>
/// Verifies that a LowHealth discrepancy is reported as resolved only when a health report
/// generated as "good" exists after the date of the first offense.
/// </summary>
/// <param name="shouldResolve">Whether the latest generated health report is a good one</param>
[Theory]
[InlineData(true)]
[InlineData(false)]
public async Task ResolvesLowHealth(bool shouldResolve)
{
    // Arrange
    var lowHealth = new Discrepancy
    {
        DateFirstOffense = DateTime.UtcNow,
        Type = DiscrepancyType.LowHealth,
        MetricSource = "the-source-1",
        MetricType = Metrics.Health,
    };
    var input = new List<Discrepancy> { lowHealth };

    var healthMetric = new Metric
    {
        Type = Metrics.Health.AsInt(),
        Source = "the-source-1"
    };

    var dataContext = _serviceProvider.GetRequiredService<IDataContext>();
    await dataContext.Metrics.AddAsync(healthMetric);

    // One good report before the offense, two bad ones after it,
    // and a final report whose state is driven by the test parameter
    await dataContext.HealthReports.AddRangeAsync(
        DiscrepancyServiceTest.GenerateHealthReport(true, healthMetric, DateTime.UtcNow.AddMinutes(-1)),
        DiscrepancyServiceTest.GenerateHealthReport(false, healthMetric, DateTime.UtcNow.AddMinutes(1)),
        DiscrepancyServiceTest.GenerateHealthReport(false, healthMetric, DateTime.UtcNow.AddMinutes(2)),
        DiscrepancyServiceTest.GenerateHealthReport(shouldResolve, healthMetric, DateTime.UtcNow.AddMinutes(3))
    );
    await dataContext.SaveChangesAsync();

    var configMock = new Mock<IConfiguration>();
    configMock
        .SetupGet(conf => conf["ServiceManager:DiscrepancyService:Health:Threshold"])
        .Returns(99.ToString());

    var loggerMock = new Mock<ILogger<DiscrepancyService>>();
    var notificationMock = new Mock<INotificationService>();

    var service = new DiscrepancyService(
        loggerMock.Object,
        dataContext,
        notificationMock.Object,
        configMock.Object
    );

    // Act
    var resolved = await service.FindResolvedDiscrepanciesAsync(input);

    // Assert
    if (!shouldResolve)
    {
        Assert.Empty(resolved);
        return;
    }

    Assert.NotEmpty(resolved);
    Assert.Equal(input, resolved);
}
[Theory]
[InlineData(true)]
[InlineData(false)]
......
......@@ -334,6 +334,7 @@ namespace StatusMonitor.Tests.UnitTests.Services
[InlineData(Metrics.Log, false)]
[InlineData(Metrics.Ping, false)]
[InlineData(Metrics.UserAction, false)]
[InlineData(Metrics.Health, false)]
public async Task VerifyMetricForGaps(Metrics type, bool shouldSucceed)
{
// Arrange
......
......@@ -532,6 +532,7 @@ namespace StatusMonitor.Tests.UnitTests.Services
[InlineData(Metrics.Log, false)]
[InlineData(Metrics.Ping, false)]
[InlineData(Metrics.UserAction, false)]
[InlineData(Metrics.Health, false)]
public async Task VerifyMetricForHighLoad(Metrics type, bool shouldSucceed)
{
// Arrange
......
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using Moq;
using StatusMonitor.Daemons.Services;
using StatusMonitor.Shared.Extensions;
using StatusMonitor.Shared.Models;
using StatusMonitor.Shared.Models.Entities;
using StatusMonitor.Shared.Services;
using Xunit;
namespace StatusMonitor.Tests.UnitTests.Services
{
public partial class DiscrepancyServiceTest
{
/// <summary>
/// Creates a health report with two data points: the first one is always labeled as normal,
/// the second one is labeled as normal or critical depending on <paramref name="good"/>.
/// </summary>
/// <param name="good">If true, the second data point is labeled normal, otherwise critical</param>
/// <param name="metric">Metric of the report; a health metric with source "smth.com" is created if null</param>
/// <param name="timestamp">Timestamp of the report; current UTC time is used if null</param>
/// <returns>The generated health report</returns>
public static HealthReport GenerateHealthReport(bool good, Metric metric = null, DateTime? timestamp = null)
{
    var secondLabel = good ? AutoLabels.Normal : AutoLabels.Critical;

    var dataPoints = new List<HealthReportDataPoint>
    {
        new HealthReportDataPoint { Label = AutoLabels.Normal.ToString() },
        new HealthReportDataPoint { Label = secondLabel.ToString() }
    };

    return new HealthReport
    {
        Data = dataPoints,
        Timestamp = timestamp ?? DateTime.UtcNow,
        Metric = metric ?? new Metric
        {
            Type = Metrics.Health.AsInt(),
            Source = "smth.com"
        }
    };
}
/// <summary>
/// Creates a mocked health report whose Health, Timestamp and Metric getters
/// return the supplied values.
/// </summary>
/// <param name="health">Value returned by the Health getter</param>
/// <param name="timestamp">Value returned by the Timestamp getter</param>
/// <param name="metric">Value returned by the Metric getter; a health metric with source "the-source" is used if null</param>
/// <returns>The mocked health report object</returns>
private HealthReport ConfigureHealthReport(int health, DateTime timestamp, Metric metric = null)
{
    var reportMetric = metric ?? new Metric
    {
        Type = Metrics.Health.AsInt(),
        Source = "the-source"
    };

    var reportMock = new Mock<HealthReport>();
    reportMock.SetupGet(r => r.Health).Returns(health);
    reportMock.Setup(r => r.Timestamp).Returns(timestamp);
    reportMock.Setup(r => r.Metric).Returns(reportMetric);

    return reportMock.Object;
}
[Fact]
public void SingleLowHealthInData()
{
// Arrange
var mockConfig = new Mock<IConfiguration>();
mockConfig