Implement monitor health check
This commit is contained in:
parent
fe4ef6b05d
commit
e27e355b75
|
@ -35,6 +35,7 @@ func reloadLogListInterval() time.Duration {
|
||||||
}
|
}
|
||||||
|
|
||||||
// task is the daemon's record of one log that is being monitored.
// Values are created by startTask and stored in daemon.tasks.
type task struct {
	log  *loglist.Log       // the log this task was started for; read by daemon.healthCheck
	stop context.CancelFunc // cancel function that startTask paired with this task
}
|
||||||
|
|
||||||
|
@ -44,9 +45,27 @@ type daemon struct {
|
||||||
tasks map[LogID]task
|
tasks map[LogID]task
|
||||||
logsLoadedAt time.Time
|
logsLoadedAt time.Time
|
||||||
logListToken *loglist.ModificationToken
|
logListToken *loglist.ModificationToken
|
||||||
|
logListError string
|
||||||
|
logListErrorAt time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
func (daemon *daemon) healthCheck(ctx context.Context) error { // TODO-2
|
func (daemon *daemon) healthCheck(ctx context.Context) error {
|
||||||
|
if time.Since(daemon.logsLoadedAt) >= healthCheckInterval {
|
||||||
|
if err := notify(ctx, daemon.config, &staleLogListEvent{
|
||||||
|
Source: daemon.config.LogListSource,
|
||||||
|
LastSuccess: daemon.logsLoadedAt,
|
||||||
|
LastError: daemon.logListError,
|
||||||
|
LastErrorTime: daemon.logListErrorAt,
|
||||||
|
}); err != nil {
|
||||||
|
return fmt.Errorf("error notifying about stale log list: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, task := range daemon.tasks {
|
||||||
|
if err := healthCheckLog(ctx, daemon.config, task.log); err != nil {
|
||||||
|
return fmt.Errorf("error checking health of log %q: %w", task.log.URL, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -64,7 +83,7 @@ func (daemon *daemon) startTask(ctx context.Context, ctlog *loglist.Log) task {
|
||||||
return fmt.Errorf("error while monitoring %s: %w", ctlog.URL, err)
|
return fmt.Errorf("error while monitoring %s: %w", ctlog.URL, err)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
return task{stop: cancel}
|
return task{log: ctlog, stop: cancel}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (daemon *daemon) loadLogList(ctx context.Context) error {
|
func (daemon *daemon) loadLogList(ctx context.Context) error {
|
||||||
|
@ -124,6 +143,8 @@ func (daemon *daemon) run(ctx context.Context) error {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
case <-reloadLogListTicker.C:
|
case <-reloadLogListTicker.C:
|
||||||
if err := daemon.loadLogList(ctx); err != nil {
|
if err := daemon.loadLogList(ctx); err != nil {
|
||||||
|
daemon.logListError = err.Error()
|
||||||
|
daemon.logListErrorAt = time.Now()
|
||||||
recordError(fmt.Errorf("error reloading log list (will try again later): %w", err))
|
recordError(fmt.Errorf("error reloading log list (will try again later): %w", err))
|
||||||
}
|
}
|
||||||
reloadLogListTicker.Reset(reloadLogListInterval())
|
reloadLogListTicker.Reset(reloadLogListInterval())
|
||||||
|
|
|
@ -10,7 +10,11 @@
|
||||||
package monitor
|
package monitor
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io/fs"
|
||||||
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
@ -18,6 +22,49 @@ import (
|
||||||
"software.sslmate.com/src/certspotter/loglist"
|
"software.sslmate.com/src/certspotter/loglist"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func healthCheckLog(ctx context.Context, config *Config, ctlog *loglist.Log) error {
|
||||||
|
var (
|
||||||
|
stateDirPath = filepath.Join(config.StateDir, "logs", ctlog.LogID.Base64URLString())
|
||||||
|
stateFilePath = filepath.Join(stateDirPath, "state.json")
|
||||||
|
sthsDirPath = filepath.Join(stateDirPath, "unverified_sths")
|
||||||
|
)
|
||||||
|
state, err := loadStateFile(stateFilePath)
|
||||||
|
if errors.Is(err, fs.ErrNotExist) {
|
||||||
|
return nil
|
||||||
|
} else if err != nil {
|
||||||
|
return fmt.Errorf("error loading state file: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if time.Since(state.LastSuccess) < healthCheckInterval {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
sths, err := loadSTHsFromDir(sthsDirPath)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("error loading STHs directory: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(sths) == 0 {
|
||||||
|
if err := notify(ctx, config, &staleSTHEvent{
|
||||||
|
Log: ctlog,
|
||||||
|
LastSuccess: state.LastSuccess,
|
||||||
|
LatestSTH: state.VerifiedSTH,
|
||||||
|
}); err != nil {
|
||||||
|
return fmt.Errorf("error notifying about stale STH: %w", err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if err := notify(ctx, config, &backlogEvent{
|
||||||
|
Log: ctlog,
|
||||||
|
LatestSTH: sths[len(sths)-1],
|
||||||
|
Position: state.DownloadPosition.Size(),
|
||||||
|
}); err != nil {
|
||||||
|
return fmt.Errorf("error notifying about backlog: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
type staleSTHEvent struct {
|
type staleSTHEvent struct {
|
||||||
Log *loglist.Log
|
Log *loglist.Log
|
||||||
LastSuccess time.Time
|
LastSuccess time.Time
|
||||||
|
@ -26,7 +73,6 @@ type staleSTHEvent struct {
|
||||||
// backlogEvent is the notification that healthCheckLog sends when a log has
// had no recent success but unverified STHs are present on disk.
// The backlog size is not stored; it is derived by the Backlog method.
type backlogEvent struct {
	Log       *loglist.Log       // log the event is about
	LatestSTH *ct.SignedTreeHead // last STH that healthCheckLog loaded from the unverified STH directory
	Position  uint64             // size of the download position recorded in the log's state file
}
|
||||||
type staleLogListEvent struct {
|
type staleLogListEvent struct {
|
||||||
|
@ -36,6 +82,10 @@ type staleLogListEvent struct {
|
||||||
LastErrorTime time.Time
|
LastErrorTime time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (e *backlogEvent) Backlog() uint64 {
|
||||||
|
return e.LatestSTH.TreeSize - e.Position
|
||||||
|
}
|
||||||
|
|
||||||
func (e *staleSTHEvent) Environ() []string {
|
func (e *staleSTHEvent) Environ() []string {
|
||||||
return []string{
|
return []string{
|
||||||
"EVENT=error",
|
"EVENT=error",
|
||||||
|
@ -45,7 +95,7 @@ func (e *staleSTHEvent) Environ() []string {
|
||||||
func (e *backlogEvent) Environ() []string {
|
func (e *backlogEvent) Environ() []string {
|
||||||
return []string{
|
return []string{
|
||||||
"EVENT=error",
|
"EVENT=error",
|
||||||
"SUMMARY=" + fmt.Sprintf("backlog of size %d from %s", e.Backlog, e.Log.URL),
|
"SUMMARY=" + fmt.Sprintf("backlog of size %d from %s", e.Backlog(), e.Log.URL),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
func (e *staleLogListEvent) Environ() []string {
|
func (e *staleLogListEvent) Environ() []string {
|
||||||
|
@ -59,7 +109,7 @@ func (e *staleSTHEvent) EmailSubject() string {
|
||||||
return fmt.Sprintf("[certspotter] Unable to contact %s since %s", e.Log.URL, e.LastSuccess)
|
return fmt.Sprintf("[certspotter] Unable to contact %s since %s", e.Log.URL, e.LastSuccess)
|
||||||
}
|
}
|
||||||
func (e *backlogEvent) EmailSubject() string {
|
func (e *backlogEvent) EmailSubject() string {
|
||||||
return fmt.Sprintf("[certspotter] Backlog of size %d from %s", e.Backlog, e.Log.URL)
|
return fmt.Sprintf("[certspotter] Backlog of size %d from %s", e.Backlog(), e.Log.URL)
|
||||||
}
|
}
|
||||||
func (e *staleLogListEvent) EmailSubject() string {
|
func (e *staleLogListEvent) EmailSubject() string {
|
||||||
return fmt.Sprintf("[certspotter] Unable to retrieve log list since %s", e.LastSuccess)
|
return fmt.Sprintf("[certspotter] Unable to retrieve log list since %s", e.LastSuccess)
|
||||||
|
@ -72,7 +122,7 @@ func (e *staleSTHEvent) Text() string {
|
||||||
fmt.Fprintf(text, "For details, see certspotter's stderr output.\n")
|
fmt.Fprintf(text, "For details, see certspotter's stderr output.\n")
|
||||||
fmt.Fprintf(text, "\n")
|
fmt.Fprintf(text, "\n")
|
||||||
if e.LatestSTH != nil {
|
if e.LatestSTH != nil {
|
||||||
fmt.Fprintf(text, "Latest known log size = %d (as of %s)\n", e.LatestSTH.TreeSize, e.LatestSTH.Timestamp)
|
fmt.Fprintf(text, "Latest known log size = %d (as of %s)\n", e.LatestSTH.TreeSize, e.LatestSTH.TimestampTime())
|
||||||
} else {
|
} else {
|
||||||
fmt.Fprintf(text, "Latest known log size = none\n")
|
fmt.Fprintf(text, "Latest known log size = none\n")
|
||||||
}
|
}
|
||||||
|
@ -84,9 +134,9 @@ func (e *backlogEvent) Text() string {
|
||||||
fmt.Fprintf(text, "\n")
|
fmt.Fprintf(text, "\n")
|
||||||
fmt.Fprintf(text, "For more details, see certspotter's stderr output.\n")
|
fmt.Fprintf(text, "For more details, see certspotter's stderr output.\n")
|
||||||
fmt.Fprintf(text, "\n")
|
fmt.Fprintf(text, "\n")
|
||||||
fmt.Fprintf(text, "Current log size = %d (as of %s)\n", e.LatestSTH.TreeSize, e.LatestSTH.Timestamp)
|
fmt.Fprintf(text, "Current log size = %d (as of %s)\n", e.LatestSTH.TreeSize, e.LatestSTH.TimestampTime())
|
||||||
fmt.Fprintf(text, "Current position = %d\n", e.Position)
|
fmt.Fprintf(text, "Current position = %d\n", e.Position)
|
||||||
fmt.Fprintf(text, " Backlog = %d\n", e.Backlog)
|
fmt.Fprintf(text, " Backlog = %d\n", e.Backlog())
|
||||||
return text.String()
|
return text.String()
|
||||||
}
|
}
|
||||||
func (e *staleLogListEvent) Text() string {
|
func (e *staleLogListEvent) Text() string {
|
||||||
|
|
Loading…
Reference in New Issue