Skip to content

Commit

Permalink
Better tracking of failed requests + logging context exclude (#485)
Browse files Browse the repository at this point in the history
- add --logExcludeContext for log contexts that should be excluded
(while --logContext specifies which are to be included)
- enable 'recorderNetwork' logging for debugging CDP network
- create default log context exclude list (containing: screencast,
recorderNetwork, jsError), customizable via --logExcludeContext

recorder: Track failed requests and include in pageinfo records with
status code 0
- cleanup cdp handler methods
- intercept requestWillBeSent to track requests that started (but may
not complete)
- fix shouldSkip() so that it still works if no URL is provided (e.g. when
checking only headers)
- set status to 0 for async fetch failures
- remove responseServedFromCache interception, as response data
generally not available then, and responseReceived is still called
- pageinfo: include page requests that failed with status code 0, also
include 'error' status if available.
- ensure page is closed on failure
- ensure pageinfo still written even if nothing else is crawled for a
page
- track cached responses, add to debug logging (can also add to pageinfo
later if needed)

tests: add pageinfo test for crawling invalid URL, which should still
result in pageinfo record with status code 0

bump to 1.0.0-beta.7
  • Loading branch information
ikreymer authored Mar 7, 2024
1 parent 65133c9 commit 9f18a49
Show file tree
Hide file tree
Showing 8 changed files with 298 additions and 130 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.0.0-beta.6",
"version": "1.0.0-beta.7",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
Expand Down
3 changes: 2 additions & 1 deletion src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,8 @@ export class Crawler {
const debugLogging = this.params.logging.includes("debug");
logger.setDebugLogging(debugLogging);
logger.setLogLevel(this.params.logLevel);
logger.setContext(this.params.context);
logger.setContext(this.params.logContext);
logger.setExcludeContext(this.params.logExcludeContext);

// if automatically restarts on error exit code,
// exit with 0 from fatal by default, to avoid unnecessary restart
Expand Down
14 changes: 13 additions & 1 deletion src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,11 @@ import {
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
import { screenshotTypes } from "./screenshots.js";
import { LOG_CONTEXT_TYPES, logger } from "./logger.js";
import {
DEFAULT_EXCLUDE_LOG_CONTEXTS,
LOG_CONTEXT_TYPES,
logger,
} from "./logger.js";

// ============================================================================
class ArgParser {
Expand Down Expand Up @@ -225,6 +229,14 @@ class ArgParser {
coerce,
},

logExcludeContext: {
describe: "Comma-separated list of contexts to NOT include in logs",
type: "array",
default: DEFAULT_EXCLUDE_LOG_CONTEXTS,
choices: LOG_CONTEXT_TYPES,
coerce,
},

text: {
describe:
"Extract initial (default) or final text to pages.jsonl or WARC resource record(s)",
Expand Down
24 changes: 21 additions & 3 deletions src/util/logger.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export const LOG_CONTEXT_TYPES = [
"general",
"worker",
"recorder",
"recorderNetwork",
"writer",
"state",
"redis",
Expand All @@ -51,13 +52,20 @@ export const LOG_CONTEXT_TYPES = [

export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];

/**
 * Log contexts that are excluded from log output by default.
 *
 * Used as the default value of the `logExcludeContext` command-line option,
 * so callers can replace this list with their own set of contexts to exclude.
 */
export const DEFAULT_EXCLUDE_LOG_CONTEXTS: LogContext[] = [
"recorderNetwork",
"jsError",
"screencast",
];

// ===========================================================================
class Logger {
logStream: Writable | null = null;
debugLogging = false;
logErrorsToRedis = false;
logLevels: string[] = [];
contexts: string[] = [];
contexts: LogContext[] = [];
excludeContexts: LogContext[] = [];
crawlState?: RedisCrawlState | null = null;
fatalExitCode = 17;

Expand All @@ -81,18 +89,22 @@ class Logger {
this.logLevels = logLevels;
}

setContext(contexts: string[]) {
setContext(contexts: LogContext[]) {
this.contexts = contexts;
}

/**
 * Sets the log contexts to suppress.
 *
 * `logAsJSON` returns early, without emitting anything, for any message
 * whose context appears in this list. An empty list disables the check.
 *
 * @param contexts - contexts whose messages should not be logged
 */
setExcludeContext(contexts: LogContext[]) {
this.excludeContexts = contexts;
}

/**
 * Stores a reference to the crawl state on this logger.
 *
 * @param crawlState - the crawl state instance to keep
 */
setCrawlState(crawlState: RedisCrawlState) {
this.crawlState = crawlState;
}

logAsJSON(
message: string,
dataUnknown: unknown,
context: string,
context: LogContext,
logLevel = "info",
) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
Expand All @@ -110,6 +122,12 @@ class Logger {
}
}

if (this.excludeContexts.length) {
if (this.excludeContexts.indexOf(context) >= 0) {
return;
}
}

const dataToLog = {
timestamp: new Date().toISOString(),
logLevel: logLevel,
Expand Down
Loading

0 comments on commit 9f18a49

Please sign in to comment.