diff --git a/monitor/daemon.go b/monitor/daemon.go index 2a11776..bcd9ba6 100644 --- a/monitor/daemon.go +++ b/monitor/daemon.go @@ -35,6 +35,7 @@ func reloadLogListInterval() time.Duration { } type task struct { + log *loglist.Log stop context.CancelFunc } @@ -44,9 +45,27 @@ type daemon struct { tasks map[LogID]task logsLoadedAt time.Time logListToken *loglist.ModificationToken + logListError string + logListErrorAt time.Time } -func (daemon *daemon) healthCheck(ctx context.Context) error { // TODO-2 +func (daemon *daemon) healthCheck(ctx context.Context) error { + if time.Since(daemon.logsLoadedAt) >= healthCheckInterval { + if err := notify(ctx, daemon.config, &staleLogListEvent{ + Source: daemon.config.LogListSource, + LastSuccess: daemon.logsLoadedAt, + LastError: daemon.logListError, + LastErrorTime: daemon.logListErrorAt, + }); err != nil { + return fmt.Errorf("error notifying about stale log list: %w", err) + } + } + + for _, task := range daemon.tasks { + if err := healthCheckLog(ctx, daemon.config, task.log); err != nil { + return fmt.Errorf("error checking health of log %q: %w", task.log.URL, err) + } + } return nil } @@ -64,7 +83,7 @@ func (daemon *daemon) startTask(ctx context.Context, ctlog *loglist.Log) task { return fmt.Errorf("error while monitoring %s: %w", ctlog.URL, err) } }) - return task{stop: cancel} + return task{log: ctlog, stop: cancel} } func (daemon *daemon) loadLogList(ctx context.Context) error { @@ -124,6 +143,8 @@ func (daemon *daemon) run(ctx context.Context) error { case <-ctx.Done(): case <-reloadLogListTicker.C: if err := daemon.loadLogList(ctx); err != nil { + daemon.logListError = err.Error() + daemon.logListErrorAt = time.Now() recordError(fmt.Errorf("error reloading log list (will try again later): %w", err)) } reloadLogListTicker.Reset(reloadLogListInterval()) diff --git a/monitor/healthcheck.go b/monitor/healthcheck.go index 9e5e79e..7b83194 100644 --- a/monitor/healthcheck.go +++ b/monitor/healthcheck.go @@ -10,7 +10,11 @@ package monitor import ( + "context" + "errors" "fmt" + "io/fs" + "path/filepath" "strings" "time" @@ -18,6 +22,49 @@ import ( "software.sslmate.com/src/certspotter/loglist" ) +func healthCheckLog(ctx context.Context, config *Config, ctlog *loglist.Log) error { + var ( + stateDirPath = filepath.Join(config.StateDir, "logs", ctlog.LogID.Base64URLString()) + stateFilePath = filepath.Join(stateDirPath, "state.json") + sthsDirPath = filepath.Join(stateDirPath, "unverified_sths") + ) + state, err := loadStateFile(stateFilePath) + if errors.Is(err, fs.ErrNotExist) { + return nil + } else if err != nil { + return fmt.Errorf("error loading state file: %w", err) + } + + if time.Since(state.LastSuccess) < healthCheckInterval { + return nil + } + + sths, err := loadSTHsFromDir(sthsDirPath) + if err != nil { + return fmt.Errorf("error loading STHs directory: %w", err) + } + + if len(sths) == 0 { + if err := notify(ctx, config, &staleSTHEvent{ + Log: ctlog, + LastSuccess: state.LastSuccess, + LatestSTH: state.VerifiedSTH, + }); err != nil { + return fmt.Errorf("error notifying about stale STH: %w", err) + } + } else { + if err := notify(ctx, config, &backlogEvent{ + Log: ctlog, + LatestSTH: sths[len(sths)-1], + Position: state.DownloadPosition.Size(), + }); err != nil { + return fmt.Errorf("error notifying about backlog: %w", err) + } + } + + return nil +} + type staleSTHEvent struct { Log *loglist.Log LastSuccess time.Time @@ -26,7 +73,6 @@ type staleSTHEvent struct { type backlogEvent struct { Log *loglist.Log LatestSTH *ct.SignedTreeHead - Backlog uint64 Position uint64 } type staleLogListEvent struct { @@ -36,6 +82,10 @@ type staleLogListEvent struct { LastErrorTime time.Time } +func (e *backlogEvent) Backlog() uint64 { + return e.LatestSTH.TreeSize - e.Position +} + func (e *staleSTHEvent) Environ() []string { return []string{ "EVENT=error", @@ -45,7 +95,7 @@ func (e *staleSTHEvent) Environ() []string { func (e *backlogEvent) Environ() []string { return []string{ "EVENT=error", - "SUMMARY=" + fmt.Sprintf("backlog of size %d from %s", e.Backlog, e.Log.URL), + "SUMMARY=" + fmt.Sprintf("backlog of size %d from %s", e.Backlog(), e.Log.URL), } } func (e *staleLogListEvent) Environ() []string { @@ -59,7 +109,7 @@ func (e *staleSTHEvent) EmailSubject() string { return fmt.Sprintf("[certspotter] Unable to contact %s since %s", e.Log.URL, e.LastSuccess) } func (e *backlogEvent) EmailSubject() string { - return fmt.Sprintf("[certspotter] Backlog of size %d from %s", e.Backlog, e.Log.URL) + return fmt.Sprintf("[certspotter] Backlog of size %d from %s", e.Backlog(), e.Log.URL) } func (e *staleLogListEvent) EmailSubject() string { return fmt.Sprintf("[certspotter] Unable to retrieve log list since %s", e.LastSuccess) @@ -72,7 +122,7 @@ func (e *staleSTHEvent) Text() string { fmt.Fprintf(text, "For details, see certspotter's stderr output.\n") fmt.Fprintf(text, "\n") if e.LatestSTH != nil { - fmt.Fprintf(text, "Latest known log size = %d (as of %s)\n", e.LatestSTH.TreeSize, e.LatestSTH.Timestamp) + fmt.Fprintf(text, "Latest known log size = %d (as of %s)\n", e.LatestSTH.TreeSize, e.LatestSTH.TimestampTime()) } else { fmt.Fprintf(text, "Latest known log size = none\n") } @@ -84,9 +134,9 @@ func (e *backlogEvent) Text() string { fmt.Fprintf(text, "\n") fmt.Fprintf(text, "For more details, see certspotter's stderr output.\n") fmt.Fprintf(text, "\n") - fmt.Fprintf(text, "Current log size = %d (as of %s)\n", e.LatestSTH.TreeSize, e.LatestSTH.Timestamp) + fmt.Fprintf(text, "Current log size = %d (as of %s)\n", e.LatestSTH.TreeSize, e.LatestSTH.TimestampTime()) fmt.Fprintf(text, "Current position = %d\n", e.Position) - fmt.Fprintf(text, " Backlog = %d\n", e.Backlog) + fmt.Fprintf(text, " Backlog = %d\n", e.Backlog()) return text.String() } func (e *staleLogListEvent) Text() string {