Skip to content

Commit

Permalink
feat: limiting processing the response body via limitBodyToNBytes w…
Browse files Browse the repository at this point in the history
…hen `searchForBodyPatterns==true`

take 2
  • Loading branch information
d-led committed Sep 17, 2024
1 parent e84fe12 commit 169ec84
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 2 deletions.
1 change: 1 addition & 0 deletions .link-checker-service.toml
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ regex = "Login Service"

[HTTPClient]
maxRedirectsCount = 15
limitBodyToNBytes = 10000000000
timeoutSeconds = 45
userAgent = "lcs/0.9"
browserUserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
Expand Down
6 changes: 6 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

Notable changes will be documented here

## 0.9.37

- limited the amount of the response body that is processed via `limitBodyToNBytes` when `searchForBodyPatterns==true`
- upgraded dependencies
- Go v1.23

## 0.9.36

- upgraded dependencies
Expand Down
3 changes: 3 additions & 0 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ const (
proxyKey = "proxy"
pacScriptURLKey = "pacScriptURL"
maxRedirectsCountKey = "maxRedirectsCount"
limitBodyToNBytesKey = "limitBodyToNBytes"
timeoutSecondsKey = "timeoutSeconds"
userAgentKey = "userAgent"
browserUserAgentKey = "browserUserAgent"
Expand Down Expand Up @@ -102,6 +103,8 @@ func init() {
_ = viper.BindPFlag(httpClientMapKey+skipCertificateCheckKey, rootCmd.PersistentFlags().Lookup(skipCertificateCheckKey))
rootCmd.PersistentFlags().Bool(enableRequestTracingKey, false, "HTTP client: enable request tracing")
_ = viper.BindPFlag(httpClientMapKey+enableRequestTracingKey, rootCmd.PersistentFlags().Lookup(enableRequestTracingKey))
rootCmd.PersistentFlags().Uint(limitBodyToNBytesKey, 0, "HTTP client: maximum number of bytes to read from the body when searching for patterns. Unlimited if 0!")
_ = viper.BindPFlag(httpClientMapKey+limitBodyToNBytesKey, rootCmd.PersistentFlags().Lookup(limitBodyToNBytesKey))
// service
rootCmd.PersistentFlags().UintP(maxConcurrentHTTPRequestsKey, "c", 256, "maximum number of total concurrent HTTP requests")
_ = viper.BindPFlag(maxConcurrentHTTPRequestsKey, rootCmd.PersistentFlags().Lookup(maxConcurrentHTTPRequestsKey))
Expand Down
57 changes: 56 additions & 1 deletion infrastructure/url_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"context"
"crypto/tls"
"fmt"
"io"
"log"
"net"
"net/http"
Expand All @@ -30,6 +31,7 @@ import (
"github.com/go-resty/resty/v2"
)

const defaultLimitBodyToNBytes = 0
const defaultMaxRedirectsCount = 15
const defaultTimeoutSeconds = 10
const defaultUserAgent = "lcs/0.9"
Expand Down Expand Up @@ -72,6 +74,7 @@ type urlCheckerSettings struct {
EnableRequestTracing bool
URLCheckerPlugins []string
PacScriptURL string
LimitBodyToNBytes uint
}

// URLChecker interface that all layers should conform to
Expand Down Expand Up @@ -203,6 +206,7 @@ func getURLCheckerSettings() urlCheckerSettings {
UserAgent: defaultUserAgent,
BrowserUserAgent: defaultBrowserUserAgent,
AcceptHeader: defaultAcceptHeader,
LimitBodyToNBytes: defaultLimitBodyToNBytes,
}

if proxyURL := viper.GetString("proxy"); proxyURL != "" {
Expand All @@ -220,6 +224,7 @@ func getURLCheckerSettings() urlCheckerSettings {
}

s.MaxRedirectsCount = viper.GetUint("HTTPClient.maxRedirectsCount")
s.LimitBodyToNBytes = viper.GetUint("HTTPClient.limitBodyToNBytes")
s.TimeoutSeconds = viper.GetUint("HTTPClient.timeoutSeconds")
if v := viper.GetString("HTTPClient.userAgent"); v != "" {
s.UserAgent = v
Expand All @@ -240,6 +245,7 @@ func getURLCheckerSettings() urlCheckerSettings {
log.Printf("HTTP client AcceptHeader: %v", s.AcceptHeader)
log.Printf("HTTP client SkipCertificateCheck: %v", s.SkipCertificateCheck)
log.Printf("HTTP client EnableRequestTracing: %v", s.EnableRequestTracing)
log.Printf("HTTP client LimitBodyToNBytes: %v", s.LimitBodyToNBytes)

// advanced configuration feature: only configurable via the config file
s.SearchForBodyPatterns = viper.GetBool("searchForBodyPatterns")
Expand Down Expand Up @@ -485,11 +491,12 @@ func (c *URLCheckerClient) tryGetRequestAndProcessResponseBody(ctx context.Conte
response, err := client.R().
SetHeader("Accept", c.settings.AcceptHeader).
SetContext(ctx).
SetDoNotParseResponse(true).
SetHeader("User-Agent", c.settings.BrowserUserAgent).
Get(urlToCheck)
res = c.processResponse(urlToCheck, response, err)
if c.settings.SearchForBodyPatterns && response != nil {
body = response.String()
body = c.limitedBody(response)
}
}

Expand Down Expand Up @@ -641,6 +648,54 @@ func (c *URLCheckerClient) tryHeadRequestAsBrowserIfForbidden(ctx context.Contex
return res
}

// limitedBody returns the raw response body as a string, reading at most
// c.settings.LimitBodyToNBytes bytes of it (the whole body when the limit
// is 0). The raw body is closed before returning.
//
// An empty string is returned when there is nothing to read: a nil response,
// or a response without a raw body.
func (c *URLCheckerClient) limitedBody(response *resty.Response) string {
	if response == nil {
		return ""
	}
	body := response.RawBody()
	if body == nil {
		// Calling Close or Read on a nil ReadCloser would panic.
		// NOTE(review): RawBody is expected to be nil when no HTTP response
		// was received (e.g. a failed request) — confirm against resty.
		return ""
	}
	defer body.Close()
	return safelyTrimmedStream(body, c.settings.LimitBodyToNBytes)
}

// safelyTrimmedStream reads input and returns what was read as a string,
// consuming at most limit bytes from the stream. A limit of 0 means
// "unlimited": the stream is read until it ends.
//
// Read errors are deliberately not reported: the function is best-effort and
// returns whatever was read before the error occurred (still capped at
// limit). A nil input yields an empty string.
func safelyTrimmedStream(input io.Reader, limit uint) string {
	if input == nil {
		return ""
	}

	reader := input
	if limit > 0 {
		// io.LimitReader takes an int64; clamp so that a huge unsigned limit
		// does not wrap around to a negative value (which would read nothing).
		const maxInt64 = 1<<63 - 1
		n := int64(maxInt64)
		if uint64(limit) < maxInt64 {
			n = int64(limit)
		}
		reader = io.LimitReader(input, n)
	}

	// io.ReadAll returns the bytes read so far alongside any error, which is
	// exactly the partial result wanted here, so the error is dropped.
	b, _ := io.ReadAll(reader)
	return string(b)
}

// safelyTrimmedString returns s cut down to at most limit bytes. A limit of 0
// disables trimming. The result is a sub-slice of s, not a copy.
func safelyTrimmedString(s []byte, limit uint) []byte {
	// Compare in the unsigned domain: converting a very large limit to int
	// wraps to a negative number, which made the length check fail and the
	// slice expression below panic.
	if limit == 0 || uint(len(s)) <= limit {
		return s
	}
	return s[:limit]
}

func buildClient(settings urlCheckerSettings) *resty.Client {
client := resty.New()
client.SetTimeout(time.Second * time.Duration(settings.TimeoutSeconds))
Expand Down
107 changes: 106 additions & 1 deletion infrastructure/url_checker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,15 @@ package infrastructure

import (
"context"
"errors"
"fmt"
"github.com/stretchr/testify/require"
"io"
"log"
"net/http"
"net/http/httptest"
"os"
"strings"
"testing"
"time"

Expand All @@ -36,10 +43,11 @@ func TestOkUrls(t *testing.T) {
// TestSearchingForBodyPatterns checks a URL with body-pattern search enabled
// and no body size limit, and expects the "google" pattern to be reported.
// NOTE(review): this issues a real request to https://google.com, so it
// depends on network access — confirm that is acceptable for this suite.
func TestSearchingForBodyPatterns(t *testing.T) {
setUpViperTestConfiguration()
viper.Set("searchForBodyPatterns", true)
// a limit of 0 means the body is read without a size limit
viper.Set("HTTPClient.limitBodyToNBytes", uint(0))
res := NewURLCheckerClient().CheckURL(context.Background(), "https://google.com")
assert.Nil(t, res.Error)
assert.Equal(t, http.StatusOK, res.Code)
// NOTE(review): setUpViperTestConfiguration registers several patterns;
// confirm that exactly one match is still the intended expectation.
assert.Len(t, res.BodyPatternsFound, 1)
// require stops the test on failure, so the index access below is not
// reached when no pattern was found
require.Contains(t, res.BodyPatternsFound, "google")
assert.Equal(t, "google", res.BodyPatternsFound[0], "should have found at least one mention of google")
}

Expand Down Expand Up @@ -71,13 +79,16 @@ func setUpViperTestConfiguration() {
viper.Set("HTTPClient.timeoutSeconds", uint(15))
viper.Set("HTTPClient.maxRedirectsCount", uint(15))
viper.Set("HTTPClient.enableRequestTracing", false)
viper.Set("HTTPClient.limitBodyToNBytes", uint(0))
viper.Set("searchForBodyPatterns", false)
viper.Set("urlCheckerPlugins", []string{})
patterns := []struct {
Name string
Regex string
}{
{"google", "google"},
{"start-a", "start-a"},
{"ab", "ab"},
}
viper.Set("bodyPatterns", patterns)
}
Expand Down Expand Up @@ -143,3 +154,97 @@ func TestResponseTimeout(t *testing.T) {
assert.NotNil(t, res.Error, "the response should have failed due to the abort")
assert.NotEqual(t, http.StatusOK, res.Code)
}

// startChunk is the fixed prefix of testStringToLimit.
const startChunk = "start-"

// testStringToLimit is the payload used by the body-limiting tests:
// startChunk followed by 300 'a' bytes and then 300 'b' bytes, so the
// first 'b' only appears well after the start of the string.
var testStringToLimit = startChunk +
strings.Repeat("a", 300) +
strings.Repeat("b", 300)

// TestLimitingBodyReading serves testStringToLimit from a local test server
// and checks it with the body limit set to 100 bytes. The "start-a" pattern
// sits inside the first 100 bytes and must be found; the "ab" pattern can only
// match where the 'a' run meets the 'b' run, which is beyond the limit, so it
// must not be reported.
func TestLimitingBodyReading(t *testing.T) {
	handler := func(w http.ResponseWriter, _ *http.Request) {
		_, _ = fmt.Fprintln(w, testStringToLimit)
	}
	server := httptest.NewServer(http.HandlerFunc(handler))
	log.Println("Test server started at:", server.URL)
	defer server.Close()

	setUpViperTestConfiguration()
	viper.Set("searchForBodyPatterns", true)
	viper.Set("HTTPClient.limitBodyToNBytes", uint(100))

	checker := NewURLCheckerClient()
	res := checker.CheckURL(context.Background(), server.URL)

	assert.Equal(t, http.StatusOK, res.Code)

	found := res.BodyPatternsFound
	assert.Contains(t, found, "start-a")
	assert.NotContains(t, found, "ab", "the repeated 'b' part of the message should have not been processed")
}

func Test_safelyTrimmedStream(t *testing.T) {
t.Run("limiting empty input produces empty string", func(t *testing.T) {
assert.Equal(t, "", safelyTrimmedStream(streamOf(""), 10))
})

t.Run("non-empty input is not limited if no limit configured", func(t *testing.T) {
assert.Equal(t, testStringToLimit, safelyTrimmedStream(streamOf(testStringToLimit), 0))
})

t.Run("limiting input to a size smaller than a chunk returns string of the limit length",
func(t *testing.T) {
assert.Equal(t, startChunk, safelyTrimmedStream(streamOf(testStringToLimit), uint(len(startChunk))))
})

t.Run("limiting input to a size larger than itself returns the original string",
func(t *testing.T) {
assert.Equal(t, testStringToLimit, safelyTrimmedStream(streamOf(testStringToLimit), 2000))
})

t.Run("limiting input to one byte results in one character",
func(t *testing.T) {
assert.Equal(t, 1, len(safelyTrimmedStream(streamOf(testStringToLimit), 1)))
})

t.Run("limiting input larger than the the buffer (1kB) to a limit larger than the buffer trims the input",
func(t *testing.T) {
assert.Equal(t, 1200, len(safelyTrimmedStream(streamOf(
strings.Repeat(testStringToLimit, 2),
), 1200)))
})

t.Run("trimming the errored stream returns the input processed", func(t *testing.T) {
assert.Equal(t, "abc", safelyTrimmedStream(faultyReaderOf(
"abc,d", 3,
), 10))
})

t.Run("untrimmed errored stream returns the input processed", func(t *testing.T) {
assert.Equal(t, "abc", safelyTrimmedStream(faultyReaderOf(
"abc,d", 3,
), 0))
})
}

// faultyReader is a test double for io.Reader: every Read hands out the first
// errorAt bytes of input and then reports an error.
type faultyReader struct {
	input   string // data to serve
	errorAt int    // number of leading bytes of input served before failing
}

// Read copies up to errorAt bytes of input into p and always returns an
// error together with the number of bytes copied.
//
// The copy is bounded by both len(p) and len(input), so a short destination
// buffer or an errorAt past the end of input yields fewer bytes instead of an
// index-out-of-range panic.
func (f *faultyReader) Read(p []byte) (int, error) {
	end := f.errorAt
	if end < 0 {
		end = 0
	}
	if end > len(f.input) {
		end = len(f.input)
	}
	n := copy(p, f.input[:end])
	return n, errors.New("expected fault")
}

// faultyReaderOf returns an io.Reader backed by a faultyReader whose input is
// s and whose errorAt is i.
func faultyReaderOf(s string, i int) io.Reader {
	r := new(faultyReader)
	r.input = s
	r.errorAt = i
	return r
}

// streamOf exposes the string s as an in-memory io.Reader.
func streamOf(s string) io.Reader {
	var stream io.Reader = strings.NewReader(s)
	return stream
}

0 comments on commit 169ec84

Please sign in to comment.