From baba0b4da58af8302c1cc4ec1d43c1f22d3b8b48 Mon Sep 17 00:00:00 2001
From: undergroundwires
Date: Wed, 13 Mar 2024 11:34:19 +0100
Subject: [PATCH] ci/cd: trigger URL checks more, and limit amount

Fix all URL checks failing in GitHub runner due to:

- Missing Happy Eyeballs in Node.js: nodejs/undici#1531, nodejs/node#41625
- Or is it due to too many max connections? Test this.
  Mentioned in a comment in nodejs/node#41625.

Key changes:

- Run URL checks more frequently: every time a collection file or the
  test code is changed.
- Introduce an environment variable to randomly select and limit the URLs
  tested, so the tests provide quicker feedback on code changes.

Other supporting changes:

- Log more information about the test before running it to enable easier
  troubleshooting.
- Move the array shuffle function for reusability and add its missing tests.
---
 .github/workflows/checks.external-urls.yaml   |  8 ++
 src/application/Common/Shuffle.ts             | 12 +++
 .../DocumentationUrlExtractor.ts              | 69 ++++++++++++++
 .../TestExecutionDetailsLogger.ts             | 26 ++++++
 tests/checks/external-urls/main.spec.ts       | 92 +++++++++++--------
 tests/unit/application/Common/Shuffle.spec.ts | 52 +++++++++++
 6 files changed, 221 insertions(+), 38 deletions(-)
 create mode 100644 src/application/Common/Shuffle.ts
 create mode 100644 tests/checks/external-urls/DocumentationUrlExtractor.ts
 create mode 100644 tests/checks/external-urls/TestExecutionDetailsLogger.ts
 create mode 100644 tests/unit/application/Common/Shuffle.spec.ts

diff --git a/.github/workflows/checks.external-urls.yaml b/.github/workflows/checks.external-urls.yaml
index 8d8b46c85..81baa0ad4 100644
--- a/.github/workflows/checks.external-urls.yaml
+++ b/.github/workflows/checks.external-urls.yaml
@@ -3,6 +3,10 @@ name: checks.external-urls
 on:
   schedule:
     - cron: '0 0 * * 0' # at 00:00 on every Sunday
+  push:
+    paths:
+      - tests/checks/external-urls/**
+      - src/application/collections/**
 
 jobs:
   run-check:
@@ -20,3 +24,7 @@ jobs:
 
       - name: Test
         run: npm run check:external-urls
+        env:
+          RANDOMIZED_URL_CHECK_LIMIT: "${{ github.event_name == 'push' && '100' || '' }}"
+          # - Scheduled checks have no limit, ensuring thorough testing.
+          # - For push events, triggered by code changes, the number of URLs is limited to provide quick feedback.
diff --git a/src/application/Common/Shuffle.ts b/src/application/Common/Shuffle.ts
new file mode 100644
index 000000000..94df09bd0
--- /dev/null
+++ b/src/application/Common/Shuffle.ts
@@ -0,0 +1,12 @@
+/*
+  Shuffle an array, returning a new array with its elements in random order.
+  Uses the Fisher-Yates (or Durstenfeld) algorithm.
+*/
+export function shuffle<T>(array: readonly T[]): T[] {
+  const shuffledArray = [...array];
+  for (let i = array.length - 1; i > 0; i--) {
+    const j = Math.floor(Math.random() * (i + 1));
+    [shuffledArray[i], shuffledArray[j]] = [shuffledArray[j], shuffledArray[i]];
+  }
+  return shuffledArray;
+}
diff --git a/tests/checks/external-urls/DocumentationUrlExtractor.ts b/tests/checks/external-urls/DocumentationUrlExtractor.ts
new file mode 100644
index 000000000..7b53b0d12
--- /dev/null
+++ b/tests/checks/external-urls/DocumentationUrlExtractor.ts
@@ -0,0 +1,69 @@
+import type { IApplication } from '@/domain/IApplication';
+import type { TestExecutionDetailsLogger } from './TestExecutionDetailsLogger';
+
+interface UrlExtractionContext {
+  readonly logger: TestExecutionDetailsLogger;
+  readonly application: IApplication;
+  readonly urlExclusionPatterns: readonly RegExp[];
+}
+
+export function extractDocumentationUrls(
+  context: UrlExtractionContext,
+): string[] {
+  const urlsInApplication = extractUrlsFromApplication(context.application);
+  context.logger.logLabeledInformation(
+    'Extracted URLs from application',
+    urlsInApplication.length.toString(),
+  );
+  const uniqueUrls = filterDuplicateUrls(urlsInApplication);
+  context.logger.logLabeledInformation(
+    'Unique URLs after deduplication',
+    `${uniqueUrls.length} (duplicates removed)`,
+  );
+  context.logger.logLabeledInformation(
+    'Exclusion patterns for URLs',
+    context.urlExclusionPatterns.length === 0
+      ? 'None (all URLs included)'
+      : context.urlExclusionPatterns.map((pattern, index) => `${index + 1}) ${pattern.toString()}`).join('\n'),
+  );
+  const includedUrls = filterUrlsExcludingPatterns(uniqueUrls, context.urlExclusionPatterns);
+  context.logger.logLabeledInformation(
+    'URLs extracted for testing',
+    `${includedUrls.length} (after applying exclusion patterns; ${uniqueUrls.length - includedUrls.length} URLs ignored)`,
+  );
+  return includedUrls;
+}
+
+function extractUrlsFromApplication(application: IApplication): string[] {
+  return [ // Get all executables
+    ...application.collections.flatMap((c) => c.getAllCategories()),
+    ...application.collections.flatMap((c) => c.getAllScripts()),
+  ]
+    // Get all docs
+    .flatMap((documentable) => documentable.docs)
+    // Parse all URLs
+    .flatMap((docString) => extractUrlsExcludingCodeBlocks(docString));
+}
+
+function filterDuplicateUrls(urls: readonly string[]): string[] {
+  return urls.filter((url, index, array) => array.indexOf(url) === index);
+}
+
+function filterUrlsExcludingPatterns(
+  urls: readonly string[],
+  patterns: readonly RegExp[],
+): string[] {
+  return urls.filter((url) => !patterns.some((pattern) => pattern.test(url)));
+}
+
+function extractUrlsExcludingCodeBlocks(textWithInlineCode: string): string[] {
+  /*
+    Matches URLs:
+    - Excludes inline code blocks as they may contain URLs not intended for user interaction
+      and not guaranteed to support expected HTTP methods, leading to false-negatives.
+    - Supports URLs containing parentheses, avoiding matches within code that might not represent
+      actual links.
+  */
+  const nonCodeBlockUrlRegex = /(?<!`)(https?:\/\/[^\s`"<>()]+(?:\([^\s`"<>()]*\))?[^\s`"<>()]*)/g;
+  return textWithInlineCode.match(nonCodeBlockUrlRegex) || [];
+}
diff --git a/tests/checks/external-urls/TestExecutionDetailsLogger.ts b/tests/checks/external-urls/TestExecutionDetailsLogger.ts
new file mode 100644
index 000000000..a8a05c33b
--- /dev/null
+++ b/tests/checks/external-urls/TestExecutionDetailsLogger.ts
@@ -0,0 +1,26 @@
+import { indentText } from '@tests/shared/Text';
+
+export class TestExecutionDetailsLogger {
+  public logTestSectionStartDelimiter(): void {
+    this.logSectionDelimiterLine();
+  }
+
+  public logTestSectionEndDelimiter(): void {
+    this.logSectionDelimiterLine();
+  }
+
+  public logLabeledInformation(
+    label: string,
+    detailedInformation: string,
+  ): void {
+    console.log([
+      `${label}:`,
+      indentText(detailedInformation),
+    ].join('\n'));
+  }
+
+  private logSectionDelimiterLine(): void {
+    const horizontalLine = '─'.repeat(40);
+    console.log(horizontalLine);
+  }
+}
diff --git a/tests/checks/external-urls/main.spec.ts b/tests/checks/external-urls/main.spec.ts
index 8fac7c71d..47538424b 100644
--- a/tests/checks/external-urls/main.spec.ts
+++ b/tests/checks/external-urls/main.spec.ts
@@ -1,19 +1,26 @@
 import { test, expect } from 'vitest';
 import { parseApplication } from '@/application/Parser/ApplicationParser';
-import type { IApplication } from '@/domain/IApplication';
 import { indentText } from '@tests/shared/Text';
 import { formatAssertionMessage } from '@tests/shared/FormatAssertionMessage';
+import { shuffle } from '@/application/Common/Shuffle';
 import { type UrlStatus, formatUrlStatus } from './StatusChecker/UrlStatus';
 import { getUrlStatusesInParallel, type BatchRequestOptions } from './StatusChecker/BatchStatusChecker';
+import { TestExecutionDetailsLogger } from './TestExecutionDetailsLogger';
+import { extractDocumentationUrls } from './DocumentationUrlExtractor';
 
 // arrange
+const logger = new TestExecutionDetailsLogger();
+logger.logTestSectionStartDelimiter();
 const app = parseApplication();
-const urls = collectUniqueUrls({
-  application: app,
-  excludePatterns: [
+let urls = extractDocumentationUrls({
+  logger,
+  urlExclusionPatterns: [
     /^https:\/\/archive\.ph/, // Drops HEAD/GET requests via fetch/curl, responding to Postman/Chromium.
   ],
+  application: app,
 });
+urls = filterUrlsToEnvironmentCheckLimit(urls);
+logger.logLabeledInformation('URLs submitted for testing', urls.length.toString());
 const requestOptions: BatchRequestOptions = {
   domainOptions: {
     sameDomainParallelize: false, // be nice to our third-party servers
@@ -30,55 +37,64 @@ const requestOptions: BatchRequestOptions = {
     enableCookies: true,
   },
 };
+logger.logLabeledInformation('HTTP request options', JSON.stringify(requestOptions, null, 2));
 const testTimeoutInMs = urls.length * 60 /* seconds */ * 1000;
+logger.logLabeledInformation('Scheduled test duration', convertMillisecondsToHumanReadableFormat(testTimeoutInMs));
+logger.logTestSectionEndDelimiter();
 
 test(`all URLs (${urls.length}) should be alive`, async () => {
   // act
   const results = await getUrlStatusesInParallel(urls, requestOptions);
   // assert
   const deadUrls = results.filter((r) => r.code === undefined || !isOkStatusCode(r.code));
-  expect(deadUrls).to.have.lengthOf(0, formatAssertionMessage([formatUrlStatusReport(deadUrls)]));
+  expect(deadUrls).to.have.lengthOf(
+    0,
+    formatAssertionMessage([createReportForDeadUrlStatuses(deadUrls)]),
+  );
 }, testTimeoutInMs);
 
 function isOkStatusCode(statusCode: number): boolean {
   return statusCode >= 200 && statusCode < 300;
 }
 
-function collectUniqueUrls(
-  options: {
-    readonly application: IApplication,
-    readonly excludePatterns?: readonly RegExp[],
-  },
-): string[] {
-  return [ // Get all nodes
-    ...options.application.collections.flatMap((c) => c.getAllCategories()),
-    ...options.application.collections.flatMap((c) => c.getAllScripts()),
-  ]
-    // Get all docs
-    .flatMap((documentable) => documentable.docs)
-    // Parse all URLs
-    .flatMap((docString) => extractUrls(docString))
-    // Remove duplicates
-    .filter((url, index, array) => array.indexOf(url) === index)
-    // Exclude certain URLs based on patterns
-    .filter((url) => !shouldExcludeUrl(url, options.excludePatterns ?? []));
+function createReportForDeadUrlStatuses(deadUrlStatuses: readonly UrlStatus[]): string {
+  return `\n${deadUrlStatuses.map((status) => indentText(formatUrlStatus(status))).join('\n---\n')}\n`;
 }
 
-function shouldExcludeUrl(url: string, patterns: readonly RegExp[]): boolean {
-  return patterns.some((pattern) => pattern.test(url));
+function filterUrlsToEnvironmentCheckLimit(originalUrls: string[]): string[] {
+  const { RANDOMIZED_URL_CHECK_LIMIT } = process.env;
+  logger.logLabeledInformation('URL check limit', RANDOMIZED_URL_CHECK_LIMIT || 'Unlimited');
+  if (RANDOMIZED_URL_CHECK_LIMIT !== undefined && RANDOMIZED_URL_CHECK_LIMIT !== '') {
+    const maxUrlsInTest = parseInt(RANDOMIZED_URL_CHECK_LIMIT, 10);
+    if (Number.isNaN(maxUrlsInTest)) {
+      throw new Error(`Invalid URL limit: ${RANDOMIZED_URL_CHECK_LIMIT}`);
+    }
+    if (maxUrlsInTest < originalUrls.length) {
+      return shuffle(originalUrls).slice(0, maxUrlsInTest);
+    }
+  }
+  return originalUrls;
 }
 
-function formatUrlStatusReport(deadUrlStatuses: readonly UrlStatus[]): string {
-  return `\n${deadUrlStatuses.map((status) => indentText(formatUrlStatus(status))).join('\n---\n')}\n`;
-}
+function convertMillisecondsToHumanReadableFormat(milliseconds: number): string {
+  const timeParts: string[] = [];
+  const addTimePart = (amount: number, label: string) => {
+    if (amount === 0) {
+      return;
+    }
+    timeParts.push(`${amount} ${label}`);
+  };
+
+  const hours = milliseconds / (1000 * 60 * 60);
+  const absoluteHours = Math.floor(hours);
+  addTimePart(absoluteHours, 'hours');
+
+  const minutes = (hours - absoluteHours) * 60;
+  const absoluteMinutes = Math.floor(minutes);
+  addTimePart(absoluteMinutes, 'minutes');
+
+  const seconds = (minutes - absoluteMinutes) * 60;
+  const absoluteSeconds = Math.floor(seconds);
+  addTimePart(absoluteSeconds, 'seconds');
 
-function extractUrls(textWithInlineCode: string): string[] {
-  /*
-    Matches URLs:
-    - Excludes inline code blocks as they may contain URLs not intended for user interaction
-      and not guaranteed to support expected HTTP methods, leading to false-negatives.
-    - Supports URLs containing parentheses, avoiding matches within code that might not represent
-      actual links.
- */ - const nonCodeBlockUrlRegex = /(?()]+(?:\([^\s`"<>()]*\))?[^\s`"<>()]*)/g; - return textWithInlineCode.match(nonCodeBlockUrlRegex) || []; + return timeParts.join(', '); } diff --git a/tests/unit/application/Common/Shuffle.spec.ts b/tests/unit/application/Common/Shuffle.spec.ts new file mode 100644 index 000000000..ebbb0aa98 --- /dev/null +++ b/tests/unit/application/Common/Shuffle.spec.ts @@ -0,0 +1,52 @@ +import { describe, it, expect } from 'vitest'; +import { shuffle } from '@/application/Common/Shuffle'; + +describe('Shuffle', () => { + describe('shuffle', () => { + it('returns a new array', () => { + // arrange + const inputArray = ['a', 'b', 'c', 'd']; + // act + const result = shuffle(inputArray); + // assert + expect(result).not.to.equal(inputArray); + }); + + it('returns an array of the same length', () => { + // arrange + const inputArray = ['a', 'b', 'c', 'd']; + // act + const result = shuffle(inputArray); + // assert + expect(result.length).toBe(inputArray.length); + }); + + it('contains the same elements', () => { + // arrange + const inputArray = ['a', 'b', 'c', 'd']; + // act + const result = shuffle(inputArray); + // assert + expect(result).to.have.members(inputArray); + }); + + it('does not modify the input array', () => { + // arrange + const inputArray = ['a', 'b', 'c', 'd']; + const inputArrayCopy = [...inputArray]; + // act + shuffle(inputArray); + // assert + expect(inputArray).to.deep.equal(inputArrayCopy); + }); + + it('handles an empty array correctly', () => { + // arrange + const inputArray: string[] = []; + // act + const result = shuffle(inputArray); + // assert + expect(result).have.lengthOf(0); + }); + }); +});