Skip to content

Commit

Permalink
[GraphQL] Collect error types in metrics (#4630)
Browse files Browse the repository at this point in the history
Adds an additional label with the type of error that occurred. Specifically it should help capture which errors are occurring, how frequently, and time elapsed before the resolver encountered an err.

---

Signed-off-by: James Phillips <jamesdphillips@gmail.com>
  • Loading branch information
jamesdphillips authored Feb 16, 2022
1 parent 2f65879 commit 4bdb712
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 53 deletions.
6 changes: 6 additions & 0 deletions backend/apid/graphql/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,12 @@ func NewService(cfg ServiceConfig) (*Service, error) {

// Configure tracing
tracer := tracing.NewPrometheusTracer()
tracer.AllowList = []string{
tracing.KeyParse,
tracing.KeyValidate,
tracing.KeyExecuteQuery,
tracing.KeyExecuteField,
}
svc.RegisterMiddleware(tracer)

err := svc.Regenerate()
Expand Down
31 changes: 22 additions & 9 deletions graphql/tracing/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package tracing

import (
"context"
"fmt"

time "github.com/echlebek/timeproxy"
"github.com/graphql-go/graphql"
Expand All @@ -24,7 +25,7 @@ var (
Help: "Time spent in GraphQL operations, in seconds",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
},
[]string{"key", "platform_key"},
[]string{"key", "platform_key", "err"},
)

noopParse = func(_ error) {}
Expand Down Expand Up @@ -73,9 +74,9 @@ func (c *PrometheusTracer) ParseDidStart(ctx context.Context) (context.Context,
return ctx, noopParse
}
t := time.Now()
return ctx, func(_ error) {
return ctx, func(err error) {
dur := msecSince(t)
met := Collector.WithLabelValues(KeyParse, platformKeys[KeyParse])
met := Collector.WithLabelValues(KeyParse, platformKeys[KeyParse], extractErrType(err))
met.Observe(dur)
}
}
Expand All @@ -86,9 +87,13 @@ func (c *PrometheusTracer) ValidationDidStart(ctx context.Context) (context.Cont
return ctx, noopValidate
}
t := time.Now()
return ctx, func(_ []gqlerrors.FormattedError) {
return ctx, func(errs []gqlerrors.FormattedError) {
dur := msecSince(t)
met := Collector.WithLabelValues(KeyValidate, platformKeys[KeyValidate])
err := "<nil>"
if len(errs) > 0 {
err = extractErrType(errs[0].OriginalError())
}
met := Collector.WithLabelValues(KeyValidate, platformKeys[KeyValidate], err)
met.Observe(dur)
}
}
Expand All @@ -99,9 +104,13 @@ func (c *PrometheusTracer) ExecutionDidStart(ctx context.Context) (context.Conte
return ctx, noopQuery
}
t := time.Now()
return ctx, func(_ *graphql.Result) {
return ctx, func(result *graphql.Result) {
dur := msecSince(t)
met := Collector.WithLabelValues(KeyExecuteQuery, platformKeys[KeyExecuteQuery])
err := "<nil>"
if result != nil && len(result.Errors) > 0 {
err = extractErrType(result.Errors[0].OriginalError())
}
met := Collector.WithLabelValues(KeyExecuteQuery, platformKeys[KeyExecuteQuery], err)
met.Observe(dur)
}
}
Expand All @@ -112,10 +121,10 @@ func (c *PrometheusTracer) ResolveFieldDidStart(ctx context.Context, i *graphql.
return ctx, noopField
}
t := time.Now()
return ctx, func(_ interface{}, _ error) {
return ctx, func(_ interface{}, err error) {
dur := msecSince(t)
key := i.ParentType.Name() + "." + i.FieldName
met := Collector.WithLabelValues(KeyExecuteField, key)
met := Collector.WithLabelValues(KeyExecuteField, key, extractErrType(err))
met.Observe(dur)
}
}
Expand All @@ -136,3 +145,7 @@ func msecSince(t time.Time) float64 {
dur := time.Since(t)
return float64(dur) / float64(time.Millisecond)
}

func extractErrType(err error) string {
return fmt.Sprintf("%T", err)
}
158 changes: 114 additions & 44 deletions graphql/tracing/prometheus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/echlebek/crock"
time "github.com/echlebek/timeproxy"
"github.com/graphql-go/graphql"
"github.com/graphql-go/graphql/gqlerrors"
"github.com/prometheus/client_golang/prometheus/testutil"
)

Expand All @@ -25,6 +26,7 @@ func TestPrometheusTracer_ParseDidStart(t *testing.T) {
allowList []string
delta time.Duration
runs int
err error
want string
}{
{
Expand All @@ -40,11 +42,11 @@ func TestPrometheusTracer_ParseDidStart(t *testing.T) {
delta: 200,
runs: 1,
want: summaryMetadata + `
graphql_duration_seconds{key="parse",platform_key="graphql.parse",quantile="0.5"} 200
graphql_duration_seconds{key="parse",platform_key="graphql.parse",quantile="0.9"} 200
graphql_duration_seconds{key="parse",platform_key="graphql.parse",quantile="0.99"} 200
graphql_duration_seconds_sum{key="parse",platform_key="graphql.parse"} 200
graphql_duration_seconds_count{key="parse",platform_key="graphql.parse"} 1
graphql_duration_seconds{err="<nil>",key="parse",platform_key="graphql.parse",quantile="0.5"} 200
graphql_duration_seconds{err="<nil>",key="parse",platform_key="graphql.parse",quantile="0.9"} 200
graphql_duration_seconds{err="<nil>",key="parse",platform_key="graphql.parse",quantile="0.99"} 200
graphql_duration_seconds_sum{err="<nil>",key="parse",platform_key="graphql.parse"} 200
graphql_duration_seconds_count{err="<nil>",key="parse",platform_key="graphql.parse"} 1
`,
},
{
Expand All @@ -53,11 +55,25 @@ func TestPrometheusTracer_ParseDidStart(t *testing.T) {
delta: 20,
runs: 5,
want: summaryMetadata + `
graphql_duration_seconds{key="parse",platform_key="graphql.parse",quantile="0.5"} 20
graphql_duration_seconds{key="parse",platform_key="graphql.parse",quantile="0.9"} 20
graphql_duration_seconds{key="parse",platform_key="graphql.parse",quantile="0.99"} 20
graphql_duration_seconds_sum{key="parse",platform_key="graphql.parse"} 100
graphql_duration_seconds_count{key="parse",platform_key="graphql.parse"} 5
graphql_duration_seconds{err="<nil>",key="parse",platform_key="graphql.parse",quantile="0.5"} 20
graphql_duration_seconds{err="<nil>",key="parse",platform_key="graphql.parse",quantile="0.9"} 20
graphql_duration_seconds{err="<nil>",key="parse",platform_key="graphql.parse",quantile="0.99"} 20
graphql_duration_seconds_sum{err="<nil>",key="parse",platform_key="graphql.parse"} 100
graphql_duration_seconds_count{err="<nil>",key="parse",platform_key="graphql.parse"} 5
`,
},
{
name: "encountered error",
allowList: []string{KeyParse},
delta: 20,
runs: 5,
err: context.DeadlineExceeded,
want: summaryMetadata + `
graphql_duration_seconds{err="context.deadlineExceededError",key="parse",platform_key="graphql.parse",quantile="0.5"} 20
graphql_duration_seconds{err="context.deadlineExceededError",key="parse",platform_key="graphql.parse",quantile="0.9"} 20
graphql_duration_seconds{err="context.deadlineExceededError",key="parse",platform_key="graphql.parse",quantile="0.99"} 20
graphql_duration_seconds_sum{err="context.deadlineExceededError",key="parse",platform_key="graphql.parse"} 100
graphql_duration_seconds_count{err="context.deadlineExceededError",key="parse",platform_key="graphql.parse"} 5
`,
},
}
Expand All @@ -68,7 +84,7 @@ func TestPrometheusTracer_ParseDidStart(t *testing.T) {
for i := 0; i < tt.runs; i++ {
_, fn := trace.ParseDidStart(context.Background())
mockTime.Set(time.Now().Add(tt.delta * time.Millisecond))
fn(nil)
fn(tt.err)
}

if err := testutil.CollectAndCompare(trace.Collector(), strings.NewReader(tt.want), "graphql_duration_seconds"); err != nil {
Expand All @@ -86,6 +102,7 @@ func TestPrometheusTracer_ValidationDidStart(t *testing.T) {
name string
allowList []string
delta time.Duration
err error
runs int
want string
}{
Expand All @@ -102,11 +119,11 @@ func TestPrometheusTracer_ValidationDidStart(t *testing.T) {
delta: 150,
runs: 1,
want: summaryMetadata + `
graphql_duration_seconds{key="validate",platform_key="graphql.validate",quantile="0.5"} 150
graphql_duration_seconds{key="validate",platform_key="graphql.validate",quantile="0.9"} 150
graphql_duration_seconds{key="validate",platform_key="graphql.validate",quantile="0.99"} 150
graphql_duration_seconds_sum{key="validate",platform_key="graphql.validate"} 150
graphql_duration_seconds_count{key="validate",platform_key="graphql.validate"} 1
graphql_duration_seconds{err="<nil>",key="validate",platform_key="graphql.validate",quantile="0.5"} 150
graphql_duration_seconds{err="<nil>",key="validate",platform_key="graphql.validate",quantile="0.9"} 150
graphql_duration_seconds{err="<nil>",key="validate",platform_key="graphql.validate",quantile="0.99"} 150
graphql_duration_seconds_sum{err="<nil>",key="validate",platform_key="graphql.validate"} 150
graphql_duration_seconds_count{err="<nil>",key="validate",platform_key="graphql.validate"} 1
`,
},
{
Expand All @@ -115,11 +132,25 @@ func TestPrometheusTracer_ValidationDidStart(t *testing.T) {
delta: 15,
runs: 8,
want: summaryMetadata + `
graphql_duration_seconds{key="validate",platform_key="graphql.validate",quantile="0.5"} 15
graphql_duration_seconds{key="validate",platform_key="graphql.validate",quantile="0.9"} 15
graphql_duration_seconds{key="validate",platform_key="graphql.validate",quantile="0.99"} 15
graphql_duration_seconds_sum{key="validate",platform_key="graphql.validate"} 120
graphql_duration_seconds_count{key="validate",platform_key="graphql.validate"} 8
graphql_duration_seconds{err="<nil>",key="validate",platform_key="graphql.validate",quantile="0.5"} 15
graphql_duration_seconds{err="<nil>",key="validate",platform_key="graphql.validate",quantile="0.9"} 15
graphql_duration_seconds{err="<nil>",key="validate",platform_key="graphql.validate",quantile="0.99"} 15
graphql_duration_seconds_sum{err="<nil>",key="validate",platform_key="graphql.validate"} 120
graphql_duration_seconds_count{err="<nil>",key="validate",platform_key="graphql.validate"} 8
`,
},
{
name: "encountered error",
allowList: []string{KeyValidate},
delta: 15,
runs: 8,
err: context.DeadlineExceeded,
want: summaryMetadata + `
graphql_duration_seconds{err="context.deadlineExceededError",key="validate",platform_key="graphql.validate",quantile="0.5"} 15
graphql_duration_seconds{err="context.deadlineExceededError",key="validate",platform_key="graphql.validate",quantile="0.9"} 15
graphql_duration_seconds{err="context.deadlineExceededError",key="validate",platform_key="graphql.validate",quantile="0.99"} 15
graphql_duration_seconds_sum{err="context.deadlineExceededError",key="validate",platform_key="graphql.validate"} 120
graphql_duration_seconds_count{err="context.deadlineExceededError",key="validate",platform_key="graphql.validate"} 8
`,
},
}
Expand All @@ -130,7 +161,12 @@ func TestPrometheusTracer_ValidationDidStart(t *testing.T) {
for i := 0; i < tt.runs; i++ {
_, fn := trace.ValidationDidStart(context.Background())
mockTime.Set(time.Now().Add(tt.delta * time.Millisecond))
fn(nil)

var errs []gqlerrors.FormattedError
if tt.err != nil {
errs = gqlerrors.FormatErrors(tt.err)
}
fn(errs)
}

if err := testutil.CollectAndCompare(trace.Collector(), strings.NewReader(tt.want), "graphql_duration_seconds"); err != nil {
Expand All @@ -149,6 +185,7 @@ func TestPrometheusTracer_ExecutionDidStart(t *testing.T) {
allowList []string
delta time.Duration
runs int
err error
want string
}{
{
Expand All @@ -164,11 +201,11 @@ func TestPrometheusTracer_ExecutionDidStart(t *testing.T) {
delta: 120,
runs: 1,
want: summaryMetadata + `
graphql_duration_seconds{key="execute_query",platform_key="graphql.execute",quantile="0.5"} 120
graphql_duration_seconds{key="execute_query",platform_key="graphql.execute",quantile="0.9"} 120
graphql_duration_seconds{key="execute_query",platform_key="graphql.execute",quantile="0.99"} 120
graphql_duration_seconds_sum{key="execute_query",platform_key="graphql.execute"} 120
graphql_duration_seconds_count{key="execute_query",platform_key="graphql.execute"} 1
graphql_duration_seconds{err="<nil>",key="execute_query",platform_key="graphql.execute",quantile="0.5"} 120
graphql_duration_seconds{err="<nil>",key="execute_query",platform_key="graphql.execute",quantile="0.9"} 120
graphql_duration_seconds{err="<nil>",key="execute_query",platform_key="graphql.execute",quantile="0.99"} 120
graphql_duration_seconds_sum{err="<nil>",key="execute_query",platform_key="graphql.execute"} 120
graphql_duration_seconds_count{err="<nil>",key="execute_query",platform_key="graphql.execute"} 1
`,
},
{
Expand All @@ -177,11 +214,24 @@ func TestPrometheusTracer_ExecutionDidStart(t *testing.T) {
delta: 12,
runs: 9,
want: summaryMetadata + `
graphql_duration_seconds{key="execute_query",platform_key="graphql.execute",quantile="0.5"} 12
graphql_duration_seconds{key="execute_query",platform_key="graphql.execute",quantile="0.9"} 12
graphql_duration_seconds{key="execute_query",platform_key="graphql.execute",quantile="0.99"} 12
graphql_duration_seconds_sum{key="execute_query",platform_key="graphql.execute"} 108
graphql_duration_seconds_count{key="execute_query",platform_key="graphql.execute"} 9
graphql_duration_seconds{err="<nil>",key="execute_query",platform_key="graphql.execute",quantile="0.5"} 12
graphql_duration_seconds{err="<nil>",key="execute_query",platform_key="graphql.execute",quantile="0.9"} 12
graphql_duration_seconds{err="<nil>",key="execute_query",platform_key="graphql.execute",quantile="0.99"} 12
graphql_duration_seconds_sum{err="<nil>",key="execute_query",platform_key="graphql.execute"} 108
graphql_duration_seconds_count{err="<nil>",key="execute_query",platform_key="graphql.execute"} 9
`,
}, {
name: "encountered error",
allowList: []string{KeyExecuteQuery},
delta: 12,
runs: 9,
err: context.DeadlineExceeded,
want: summaryMetadata + `
graphql_duration_seconds{err="context.deadlineExceededError",key="execute_query",platform_key="graphql.execute",quantile="0.5"} 12
graphql_duration_seconds{err="context.deadlineExceededError",key="execute_query",platform_key="graphql.execute",quantile="0.9"} 12
graphql_duration_seconds{err="context.deadlineExceededError",key="execute_query",platform_key="graphql.execute",quantile="0.99"} 12
graphql_duration_seconds_sum{err="context.deadlineExceededError",key="execute_query",platform_key="graphql.execute"} 108
graphql_duration_seconds_count{err="context.deadlineExceededError",key="execute_query",platform_key="graphql.execute"} 9
`,
},
}
Expand All @@ -192,7 +242,12 @@ func TestPrometheusTracer_ExecutionDidStart(t *testing.T) {
for i := 0; i < tt.runs; i++ {
_, fn := trace.ExecutionDidStart(context.Background())
mockTime.Set(time.Now().Add(tt.delta * time.Millisecond))
fn(nil)

result := &graphql.Result{}
if tt.err != nil {
result.Errors = gqlerrors.FormatErrors(tt.err)
}
fn(result)
}

if err := testutil.CollectAndCompare(trace.Collector(), strings.NewReader(tt.want), "graphql_duration_seconds"); err != nil {
Expand All @@ -211,6 +266,7 @@ func TestPrometheusTracer_ResolveFieldDidStart(t *testing.T) {
allowList []string
delta time.Duration
runs int
err error
want string
}{
{
Expand All @@ -226,11 +282,11 @@ func TestPrometheusTracer_ResolveFieldDidStart(t *testing.T) {
delta: 120,
runs: 1,
want: summaryMetadata + `
graphql_duration_seconds{key="execute_field",platform_key="Check.interval",quantile="0.5"} 120
graphql_duration_seconds{key="execute_field",platform_key="Check.interval",quantile="0.9"} 120
graphql_duration_seconds{key="execute_field",platform_key="Check.interval",quantile="0.99"} 120
graphql_duration_seconds_sum{key="execute_field",platform_key="Check.interval"} 120
graphql_duration_seconds_count{key="execute_field",platform_key="Check.interval"} 1
graphql_duration_seconds{err="<nil>",key="execute_field",platform_key="Check.interval",quantile="0.5"} 120
graphql_duration_seconds{err="<nil>",key="execute_field",platform_key="Check.interval",quantile="0.9"} 120
graphql_duration_seconds{err="<nil>",key="execute_field",platform_key="Check.interval",quantile="0.99"} 120
graphql_duration_seconds_sum{err="<nil>",key="execute_field",platform_key="Check.interval"} 120
graphql_duration_seconds_count{err="<nil>",key="execute_field",platform_key="Check.interval"} 1
`,
},
{
Expand All @@ -239,11 +295,25 @@ func TestPrometheusTracer_ResolveFieldDidStart(t *testing.T) {
delta: 15,
runs: 7,
want: summaryMetadata + `
graphql_duration_seconds{key="execute_field",platform_key="Check.interval",quantile="0.5"} 15
graphql_duration_seconds{key="execute_field",platform_key="Check.interval",quantile="0.9"} 15
graphql_duration_seconds{key="execute_field",platform_key="Check.interval",quantile="0.99"} 15
graphql_duration_seconds_sum{key="execute_field",platform_key="Check.interval"} 105
graphql_duration_seconds_count{key="execute_field",platform_key="Check.interval"} 7
graphql_duration_seconds{err="<nil>",key="execute_field",platform_key="Check.interval",quantile="0.5"} 15
graphql_duration_seconds{err="<nil>",key="execute_field",platform_key="Check.interval",quantile="0.9"} 15
graphql_duration_seconds{err="<nil>",key="execute_field",platform_key="Check.interval",quantile="0.99"} 15
graphql_duration_seconds_sum{err="<nil>",key="execute_field",platform_key="Check.interval"} 105
graphql_duration_seconds_count{err="<nil>",key="execute_field",platform_key="Check.interval"} 7
`,
},
{
name: "encountered error",
allowList: []string{KeyExecuteField},
delta: 15,
runs: 7,
err: context.DeadlineExceeded,
want: summaryMetadata + `
graphql_duration_seconds{err="context.deadlineExceededError",key="execute_field",platform_key="Check.interval",quantile="0.5"} 15
graphql_duration_seconds{err="context.deadlineExceededError",key="execute_field",platform_key="Check.interval",quantile="0.9"} 15
graphql_duration_seconds{err="context.deadlineExceededError",key="execute_field",platform_key="Check.interval",quantile="0.99"} 15
graphql_duration_seconds_sum{err="context.deadlineExceededError",key="execute_field",platform_key="Check.interval"} 105
graphql_duration_seconds_count{err="context.deadlineExceededError",key="execute_field",platform_key="Check.interval"} 7
`,
},
}
Expand All @@ -258,7 +328,7 @@ func TestPrometheusTracer_ResolveFieldDidStart(t *testing.T) {
for i := 0; i < tt.runs; i++ {
_, fn := trace.ResolveFieldDidStart(context.Background(), info)
mockTime.Set(time.Now().Add(tt.delta * time.Millisecond))
fn(nil, nil)
fn(nil, tt.err)
}

if err := testutil.CollectAndCompare(trace.Collector(), strings.NewReader(tt.want), "graphql_duration_seconds"); err != nil {
Expand Down

0 comments on commit 4bdb712

Please sign in to comment.