diff --git a/README.md b/README.md index 935e0476c2d5154b5542084c2eb51920f3047a89..a4525e6b5f55a5cbc33556fb37aae3e33a61b2a7 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,8 @@ Options: -s, --skip <globs> URLs to skip defined by globs, like '*linkedin*' (default: []) --reporters <coma-separated-strings> Reporters to use in processing the results (junit, console) (default: ["console"]) --retries <number> The number of times to retry TIMEOUT URLs (default: 3) + --user-agent <string> The User-Agent header (default: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 + (KHTML, like Gecko) Version/14.1 Safari/605.1.15") --ignore-prefixes <coma-separated-strings> prefix(es) to ignore (without ':'), like mailto: and tel: (default: ["javascript","data","mailto","sms","tel","geo"]) --accept-codes <coma-separated-numbers> HTTP response code(s) (beyond 200-299) to accept, like 999 for linkedin (default: [999]) --ignore-skipped Do not report skipped URLs (default: false) @@ -180,6 +182,8 @@ JUnit file treats pages as test suites and URLs in a page as test cases. `--retries` will instruct the number of times to try a URL before declaring it failed. +`--user-agent <string>` will use the specified `User-Agent` header when making requests (some websites reply with 401 Unauthorized to clients they consider "bots") + `--ignore-prefixes <coma-separated-strings>` is a list of prefixes/ schemas to skip, such as `mailto:`. Provided list should not include colons. 
diff --git a/package.json b/package.json index 6fa7ef29744b0457c882dffb4d92d558bb4f1f4b..4e2b0d766af99bb5eed37e9cbc2f299391bd6da9 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "broken-links-inspector", - "version": "1.3.4", + "version": "1.4.0", "description": "Extract and recursively check all URLs reporting broken ones", "main": "dist/inspector.js", "types": "dist/inspector.d.ts", diff --git a/src/index.ts b/src/index.ts index d7cf211da4a6c3317bef3a33b4b55c1fe7cc7bb2..49d45bae10c277b2a8e3c87e059e7e0cad71060a 100644 --- a/src/index.ts +++ b/src/index.ts @@ -7,7 +7,7 @@ import { ConsoleReporter, JUnitReporter } from "./report" import fs from "fs/promises" commander - .version("1.3.4") + .version("1.4.0") .description("Extract and recursively check all URLs reporting broken ones\n\nDedicated to Daria Bogatova \u2665") commander @@ -19,6 +19,7 @@ commander .option("-s, --skip <globs>", "URLs to skip defined by globs, like '*linkedin*'", (value: string, previous: string[]) => previous.concat([value]), []) .option("--reporters <coma-separated-strings>", "Reporters to use in processing the results (junit, console)", (value: string, _) => value.split(","), ["console"]) .option("--retries <number>", "The number of times to retry TIMEOUT URLs", (value: string, _) => parseInt(value), 3) + .option("--user-agent <string>", "The User-Agent header", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15") .option("--ignore-prefixes <coma-separated-strings>", "prefix(es) to ignore (without ':'), like mailto: and tel:", (value: string, _) => value.split(","), ["javascript", "data", "mailto", "sms", "tel", "geo"]) .option("--accept-codes <coma-separated-numbers>", "HTTP response code(s) (beyond 200-299) to accept, like 999 for linkedin", (value: string, _) => value.split(",").map(code => parseInt(code)), [999]) .option("--ignore-skipped", "Do not report skipped URLs", false) @@ -63,7 +64,8 @@ 
commander ignoreSkipped: inspectObj.ignoreSkipped as boolean, singleThreaded: inspectObj.singleThreaded as boolean, disablePrint: false, - retries: inspectObj.retries as number + retries: inspectObj.retries as number, + userAgent: inspectObj.userAgent as string }) if (urls.length == 0) { diff --git a/src/inspector.ts b/src/inspector.ts index ea51a7f1836342f3865f16dce0de3138a29ca9e7..7fb9b2913c27dac0d40d8b9d06440b2e75784b7f 100644 --- a/src/inspector.ts +++ b/src/inspector.ts @@ -5,7 +5,7 @@ import { isMatch } from "matcher" import pluralize from "pluralize" export interface IHttpClient { - request(get: boolean, url: string): Promise<string> + request(get: boolean, url: string, ua: string): Promise<string> } export class HttpClientFailure { @@ -37,12 +37,22 @@ export class AxiosHttpClient implements IHttpClient { return result } - async request(get: boolean, url: string): Promise<string> { + async request(get: boolean, url: string, ua: string): Promise<string> { const instance = axios.create() + const headers = { + "User-Agent": ua, + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-us", + "Connection": "keep-alive" + } + try { - return (await this.timeoutWrapper(this.timeout, () => get ? instance.get(url) : instance.head(url))).data as string + return (await this.timeoutWrapper( + this.timeout, + () => get ? instance.get(url, { headers: headers }) : instance.head(url, { headers: headers }) + )).data as string } catch (exception) { const error: AxiosError = exception @@ -102,7 +112,7 @@ export class Inspector { } else { const urlToCheck = parent ? 
new URL(url, parent).href : url - const html = await this.httpClient.request(useGet || shouldParse, urlToCheck) + const html = await this.httpClient.request(useGet || shouldParse, urlToCheck, this.config.userAgent) if (shouldParse) { @@ -200,6 +210,7 @@ export class Config { singleThreaded = false disablePrint = false retries = 3 + userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15" } export enum URLMatchingRule { diff --git a/test/process-url.ts b/test/process-url.ts index 519399d1e9b51c70f8699294ed69f40eb543e46d..b990d55535ba09ef26bfa99d5c680fbc198c2131 100644 --- a/test/process-url.ts +++ b/test/process-url.ts @@ -12,7 +12,7 @@ class MockHttpClient implements IHttpClient { // Map<url, [response, timeout, failure, code, retries]> constructor(readonly map: Map<string, [string[], boolean, boolean, number, number]>) { } - async request(get: boolean, url: string): Promise<string> { + async request(get: boolean, url: string, _: string): Promise<string> { // eslint-disable-next-line @typescript-eslint/no-non-null-assertion const [urls, timeout, failure, code, retries] = this.map.get(url)! 
@@ -120,13 +120,15 @@ const expectedNonRecursive = new Map<string, ResultItem[]>([ describe("Axios web server", async () => { + const ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15" + it("OK", async () => { - await new AxiosHttpClient(5000, []).request(false, "https://dbogatov.org") + await new AxiosHttpClient(5000, []).request(false, "https://dbogatov.org", ua) }) it("timeout", async () => { try { - await new AxiosHttpClient(5, []).request(false, "https://dbogatov.org") + await new AxiosHttpClient(5, []).request(false, "https://dbogatov.org", ua) } catch (exception) { const error: HttpClientFailure = exception assert(error.timeout) @@ -135,7 +137,7 @@ describe("Axios web server", async () => { it("404", async () => { try { - await new AxiosHttpClient(2000, []).request(false, "https://dbogatov.org/not-found-123") + await new AxiosHttpClient(2000, []).request(false, "https://dbogatov.org/not-found-123", ua) } catch (exception) { const error: HttpClientFailure = exception assert(error.code == 404) @@ -144,7 +146,7 @@ describe("Axios web server", async () => { it("generic", async () => { try { - await new AxiosHttpClient(1000, []).request(true, "ftp://bad-url-54234534.com") + await new AxiosHttpClient(1000, []).request(true, "ftp://bad-url-54234534.com", ua) } catch (exception) { const error: HttpClientFailure = exception assert(!error.timeout)