From 40a7b69e379e3ae90f17904a36aeb98eb8c00a55 Mon Sep 17 00:00:00 2001 From: Dmytro Bogatov <dmytro@dbogatov.org> Date: Sun, 21 Jun 2020 23:30:59 -0400 Subject: [PATCH] Refactor. Split files. Add reporters (Console for now). Add colors. --- .vscode/launch.json | 15 +++ .vscode/tasks.json | 18 ++++ src/extractor.ts | 219 ------------------------------------------- src/index.ts | 17 +++- src/inspector.ts | 149 +++++++++++++++++++++++++++++ src/report.ts | 82 ++++++++++++++++ src/result.ts | 47 ++++++++++ test/extract-urls.ts | 60 ++++++------ test/process-url.ts | 11 ++- tsconfig.json | 3 +- 10 files changed, 361 insertions(+), 260 deletions(-) create mode 100644 .vscode/tasks.json delete mode 100644 src/extractor.ts create mode 100644 src/inspector.ts create mode 100644 src/report.ts create mode 100644 src/result.ts diff --git a/.vscode/launch.json b/.vscode/launch.json index a826e42..64da791 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -17,6 +17,21 @@ "console": "integratedTerminal", "internalConsoleOptions": "neverOpen", "protocol": "inspector" + }, + { + "type": "node", + "request": "launch", + "name": "Index", + "program": "${workspaceFolder}/dist/index.js", + "args": [ + "inspect", + "https://dbogatov.org" + ], + "console": "integratedTerminal", + "internalConsoleOptions": "neverOpen", + "protocol": "inspector", + "preLaunchTask": "build", + "outFiles": ["${workspaceFolder}/dist/**/*.js"] } ] } diff --git a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..e346f41 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,18 @@ +{ + "version": "2.0.0", + "tasks": [ + { + "label": "build", + "type": "shell", + "command": "tsc", + "args": [], + "options": { + "cwd": "${workspaceFolder}" + }, + "group": { + "kind": "build", + "isDefault": true + } + } + ] +} diff --git a/src/extractor.ts b/src/extractor.ts deleted file mode 100644 index 743225d..0000000 --- a/src/extractor.ts +++ /dev/null @@ -1,219 +0,0 @@ -import * as parser from "htmlparser2" -import axios, { AxiosError } from "axios" - -export async function processURL(originalUrl: URL, recursive: boolean, config: Config, matcher: URLsMatchingSet): Promise<Result> { - - let result = new Result(); - let urlsToCheck: [string, string?][] = [[originalUrl.href, undefined]] - - let processingRoutine = async (url: string, parent?: string) => { - - try { - url = parent ? new URL(url, parent).href : url - - if (result.isChecked(url) || config.ignoredExtensions.some(ext => url.startsWith(ext + ":"))) { - result.add({ url: url, status: CheckStatus.Skipped }, parent) - } else { - - const response = await axios.get(parent ? new URL(url, parent).href : url, { timeout: config.timeout }) - let html = response.data as string - - if (recursive && originalUrl.host == new URL(url).host) { - - let discoveredURLs = extractURLs(html, matcher) - - for (const discovered of discoveredURLs) { - urlsToCheck.push([discovered, url]) - } - } - - result.add({ url: url, status: CheckStatus.OK }, parent) - } - - } catch (exception) { - const error: AxiosError = exception; - - if (!error.response) { - result.add({ url: url, status: CheckStatus.GenericError }, parent) - } else { - if (config.acceptedCodes.some(code => code == error.response?.status)) { - result.add({ url: url, status: CheckStatus.OK }, parent) - } else { - result.add({ url: url, status: CheckStatus.NonSuccessCode, message: `${error.response.status}` }, parent) - } - } - } - - } - - let promises: Promise<void>[] = [] - - while (urlsToCheck.length > 0) { - - let [url, parent] = urlsToCheck.pop()! - - promises.push(processingRoutine(url, parent)) - - if (urlsToCheck.length == 0) { - await Promise.all(promises) - } - } - - return result -} - -export function extractURLs(html: string, matcher: URLsMatchingSet): Set<string> { - - let urls = new Set<string>(); - - let parserInstance = new parser.Parser( - { - onopentag(name, attributes) { - const match = matcher.match(name, attributes); - if (match && match !== "" && !match.startsWith("#")) { - urls.add(match) - } - } - }, - { decodeEntities: true } - ); - parserInstance.write(html) - parserInstance.end() - - return urls -} - -export class Config { - public acceptedCodes: number[] = [999] - public timeout: number = 2000 - public ignoredExtensions: string[] = ["mailto", "tel"] -} - -export class Result { - private pages = new Map<string, ResultItem[]>() - private checkedUrls = new Set<string>() - - public add(completedCheck: ResultItem, parent: string = "original request") { - // console.log(`${completedCheck.url} : ${completedCheck.status} ${completedCheck.message ? completedCheck.message : ""}`) // TODO - - if (this.pages.has(parent)) { - this.pages.get(parent)?.push(completedCheck) - } else { - this.pages.set(parent, [completedCheck]) - } - this.checkedUrls.add(completedCheck.url) - } - - public isChecked(url: string): boolean { - return this.checkedUrls.has(url) - } - - public count(): number { - let count = 0 - for (const page of this.pages.entries()) { - count += page[1].length - } - return count - } - - public report(): void { // TODO - - let allSkipped = 0 - let allOks = 0 - let allBroken = 0 - - for (const page of this.pages.entries()) { - console.log(page[0]) - - let skipped = 0 - let oks = 0 - let broken = 0 - - for (const check of page[1]) { - switch (check.status) { - case CheckStatus.OK: - oks++ - break - case CheckStatus.NonSuccessCode: - case CheckStatus.GenericError: - case CheckStatus.Timeout: - broken++ - break - case CheckStatus.Skipped: - skipped++ - break - } - - if (check.status != CheckStatus.Skipped) { - console.log(`\t${check.status} : ${check.url}`) - } - } - console.log(`\tOK: ${oks}, skipped: ${skipped}, broken: ${broken}`) - allOks += oks - allSkipped += skipped - allBroken += broken - } - console.log(`OK: ${allOks}, skipped: ${allSkipped}, broken: ${allBroken}`) - } -} - -export class ResultItem { - public url = "" - public status = CheckStatus.OK - public message?: string -} - -export enum CheckStatus { - OK = "OK", - Skipped = "SKIP", - Timeout = "TIMEOUT", - NonSuccessCode = "ERROR CODE", - GenericError = "UNKNOWN" -} - -export enum URLMatchingRule { - AHRef = "<a href>", - ScriptSrc = "<script src>", - LinkHref = "<link href>", - ImgSrc = "<img src>" -} - -export class URLsMatchingSet { - private rules: URLMatchingRule[] - - constructor(...rules: URLMatchingRule[]) { - this.rules = rules.length > 0 ? rules : Object.values(URLMatchingRule); - } - - public match(name: string, attributes: { [s: string]: string }): string | undefined { - - for (const rule of this.rules) { - switch (rule) { - case URLMatchingRule.AHRef: - if (name === "a" && "href" in attributes) { - return attributes.href - } - break; - case URLMatchingRule.ScriptSrc: - if (name === "script" && "src" in attributes) { - return attributes.src - } - break; - case URLMatchingRule.LinkHref: - if (name === "link" && "href" in attributes) { - return attributes.href - } - break; - case URLMatchingRule.ImgSrc: - if (name === "img" && "src" in attributes) { - return attributes.src - } - break; - default: - throw new Error(`unknown rule: ${rule}`); - } - } - - return undefined - } -} diff --git a/src/index.ts b/src/index.ts index acda401..5fd1034 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,16 +1,23 @@ import commander from "commander" import chalk from "chalk" -import { extractURLs } from "./extractor" +import { Inspector, URLsMatchingSet, Config } from "./inspector" +import { ConsoleReporter } from "./report" commander .version("0.1.0") // TODO automatically .description("Extract and recursively check all URLs reporting broken ones") commander - .command("list") - .description("List extracted URLs") - .action(() => { - console.log("Action list called"); + .command("inspect <url>") + .description("Check links in the given URL") + .option("-r, --recursive", "recursively check all links in all URLs within supplied host", false) + .action(async (url: string, inspectObj) => { + + let inspector = new Inspector(new URLsMatchingSet(), new Config()) + let result = await inspector.processURL(new URL(url), inspectObj.recursive) + let success = result.report(new ConsoleReporter()) + + process.exit(success ? 0 : 1) }) if (!process.argv.slice(2).length) { diff --git a/src/inspector.ts b/src/inspector.ts new file mode 100644 index 0000000..b97e1cd --- /dev/null +++ b/src/inspector.ts @@ -0,0 +1,149 @@ +import * as parser from "htmlparser2" +import axios, { AxiosError } from "axios" +import { Result, CheckStatus } from "./result"; + +export class Inspector { + + constructor( + private readonly matcher: URLsMatchingSet, + private readonly config: Config + ) { } + + async processURL(originalUrl: URL, recursive: boolean): Promise<Result> { + + let result = new Result(); + let urlsToCheck: [string, string?][] = [[originalUrl.href, undefined]] + + let processingRoutine = async (url: string, parent?: string) => { + + try { + url = parent ? new URL(url, parent).href : url + + if (result.isChecked(url) || this.config.ignoredExtensions.some(ext => url.startsWith(ext + ":"))) { + result.add({ url: url, status: CheckStatus.Skipped }, parent) + } else { + + const response = await axios.get(parent ? new URL(url, parent).href : url, { timeout: this.config.timeout }) + let html = response.data as string + + if (url == originalUrl.href || (recursive && originalUrl.host == new URL(url).host)) { + + let discoveredURLs = this.extractURLs(html) + + for (const discovered of discoveredURLs) { + urlsToCheck.push([discovered, url]) + } + } + + result.add({ url: url, status: CheckStatus.OK }, parent) + } + + } catch (exception) { + + const error: AxiosError = exception; + + if (!error.response) { + result.add({ url: url, status: CheckStatus.GenericError }, parent) + } else { + if (this.config.acceptedCodes.some(code => code == error.response?.status)) { + result.add({ url: url, status: CheckStatus.OK }, parent) + } else { + result.add({ url: url, status: CheckStatus.NonSuccessCode, message: `${error.response.status}` }, parent) + } + } + } + + } + + let promises: Promise<void>[] = [] + + while (urlsToCheck.length > 0) { + + let [url, parent] = urlsToCheck.pop()! + + promises.push(processingRoutine(url, parent)) + + if (urlsToCheck.length == 0) { + await Promise.all(promises) + } + } + + return result + } + + extractURLs(html: string): Set<string> { + + let urls = new Set<string>(); + let matcher = this.matcher + + let parserInstance = new parser.Parser( + { + onopentag(name, attributes) { + const match = matcher.match(name, attributes); + if (match && match !== "" && !match.startsWith("#")) { + urls.add(match) + } + } + }, + { decodeEntities: true } + ); + parserInstance.write(html) + parserInstance.end() + + return urls + } +} + + +export class Config { + public acceptedCodes: number[] = [999] + public timeout: number = 2000 + public ignoredExtensions: string[] = ["mailto", "tel"] +} + +export enum URLMatchingRule { + AHRef = "<a href>", + ScriptSrc = "<script src>", + LinkHref = "<link href>", + ImgSrc = "<img src>" +} + +export class URLsMatchingSet { + private rules: URLMatchingRule[] + + constructor(...rules: URLMatchingRule[]) { + this.rules = rules.length > 0 ? rules : Object.values(URLMatchingRule); + } + + public match(name: string, attributes: { [s: string]: string }): string | undefined { + + for (const rule of this.rules) { + switch (rule) { + case URLMatchingRule.AHRef: + if (name === "a" && "href" in attributes) { + return attributes.href + } + break; + case URLMatchingRule.ScriptSrc: + if (name === "script" && "src" in attributes) { + return attributes.src + } + break; + case URLMatchingRule.LinkHref: + if (name === "link" && "href" in attributes) { + return attributes.href + } + break; + case URLMatchingRule.ImgSrc: + if (name === "img" && "src" in attributes) { + return attributes.src + } + break; + default: + throw new Error(`unknown rule: ${rule}`); + } + } + + return undefined + } +} diff --git a/src/report.ts b/src/report.ts new file mode 100644 index 0000000..c92af94 --- /dev/null +++ b/src/report.ts @@ -0,0 +1,82 @@ +import { ResultItem, CheckStatus } from "./result"; +import chalk from "chalk"; + +export interface IReporter { + process(pages: Map<string, ResultItem[]>): boolean +} + +// ConsoleReporter + +export class ConsoleReporter implements IReporter { + + private printTotals(oks: number, skipped: number, broken: number, indent: boolean = true) { + console.log(`${indent ? "\t" : ""}${chalk.green(`OK: ${oks}`)}, ${chalk.grey(`skipped: ${skipped}`)}, ${chalk.red(`broken: ${broken}`)}`) + } + + private printCheck(status: CheckStatus, url: string) { + let statusLabel: string + const labelWidth = 7 + + switch (status) { + case CheckStatus.OK: + statusLabel = chalk.green("OK".padEnd(labelWidth)) + break; + case CheckStatus.Skipped: + statusLabel = chalk.gray("SKIP".padEnd(labelWidth)) + break; + case CheckStatus.Timeout: + statusLabel = chalk.yellow("TIMEOUT".padEnd(labelWidth)) + break; + case CheckStatus.NonSuccessCode: + case CheckStatus.GenericError: + statusLabel = chalk.red("BROKEN".padEnd(labelWidth)) + break; + } + + if (status != CheckStatus.Skipped) { + console.log(`\t${statusLabel} : ${chalk.italic(url)}`) + } + } + + process(pages: Map<string, ResultItem[]>): boolean { + + let allSkipped = 0 + let allOks = 0 + let allBroken = 0 + + for (const page of pages.entries()) { + console.log(page[0]) + + let skipped = 0 + let oks = 0 + let broken = 0 + + for (const check of page[1]) { + switch (check.status) { + case CheckStatus.OK: + oks++ + break + case CheckStatus.NonSuccessCode: + case CheckStatus.GenericError: + case CheckStatus.Timeout: + broken++ + break + case CheckStatus.Skipped: + skipped++ + break + } + + this.printCheck(check.status, check.url) + + } + this.printTotals(oks, skipped, broken) + allOks += oks + allSkipped += skipped + allBroken += broken + } + this.printTotals(allOks, allSkipped, allBroken, false) + + return allBroken == 0 + } + +} diff --git a/src/result.ts b/src/result.ts new file mode 100644 index 0000000..3b0f24c --- /dev/null +++ b/src/result.ts @@ -0,0 +1,47 @@ +import { IReporter } from "./report" + +export class Result { + private pages = new Map<string, ResultItem[]>() + private checkedUrls = new Set<string>() + + public add(completedCheck: ResultItem, parent: string = "original request") { + // console.log(`${completedCheck.url} : ${completedCheck.status} ${completedCheck.message ? completedCheck.message : ""}`) // TODO + + if (this.pages.has(parent)) { + this.pages.get(parent)?.push(completedCheck) + } else { + this.pages.set(parent, [completedCheck]) + } + this.checkedUrls.add(completedCheck.url) + } + + public isChecked(url: string): boolean { + return this.checkedUrls.has(url) + } + + public count(): number { + let count = 0 + for (const page of this.pages.entries()) { + count += page[1].length + } + return count + } + + public report<ReporterT extends IReporter>(reporter: ReporterT): boolean { + return reporter.process(this.pages) + } +} + +export class ResultItem { + public url = "" + public status = CheckStatus.OK + public message?: string +} + +export enum CheckStatus { + OK = "OK", + Skipped = "SKIP", + Timeout = "TIMEOUT", + NonSuccessCode = "ERROR CODE", + GenericError = "UNKNOWN" +} diff --git a/test/extract-urls.ts b/test/extract-urls.ts index 04b57f5..d56611a 100644 --- a/test/extract-urls.ts +++ b/test/extract-urls.ts @@ -1,4 +1,4 @@ -import { extractURLs, URLMatchingRule, URLsMatchingSet } from "../src/extractor" +import { Inspector, URLsMatchingSet, URLMatchingRule, Config } from "../src/inspector" import { expect, assert } from "chai"; import "mocha"; @@ -7,63 +7,63 @@ describe("extractURLs", () => { const url = "dbogatov.org" it("works for <a href=...>", () => { - const result = extractURLs(`<html><a href="${url}">Text</a></html>`, new URLsMatchingSet(URLMatchingRule.AHRef)) + const result = new Inspector(new URLsMatchingSet(URLMatchingRule.AHRef), new Config()).extractURLs(`<html><a href="${url}">Text</a></html>`) expect(result).to.eql(new Set([url])) }); it("works for <script src=...>", () => { - const result = extractURLs(`<html><script src="${url}">Text</script></html>`, new URLsMatchingSet(URLMatchingRule.ScriptSrc)) + const result = new Inspector(new URLsMatchingSet(URLMatchingRule.ScriptSrc), new Config()).extractURLs(`<html><script src="${url}">Text</script></html>`) expect(result).to.eql(new Set([url])) }); it("works for <link href=...>", () => { - const result = extractURLs(`<html><link href="${url}"></link></html>`, new URLsMatchingSet(URLMatchingRule.LinkHref)) + const result = new Inspector(new URLsMatchingSet(URLMatchingRule.LinkHref), new Config()).extractURLs(`<html><link href="${url}"></link></html>`) expect(result).to.eql(new Set([url])) }); it("works for <img src=...>", () => { - const result = extractURLs(`<html><img src="${url}">Text</img></html>`, new URLsMatchingSet(URLMatchingRule.ImgSrc)) + const result = new Inspector(new URLsMatchingSet(URLMatchingRule.ImgSrc), new Config()).extractURLs(`<html><img src="${url}">Text</img></html>`) expect(result).to.eql(new Set([url])) }); it("works for many rules", () => { - const result = extractURLs( - `<html> - <a href="1">Text</a> - <script src="2">Text</script> - <link href="3"></link> - <img src="4">Text</img> - </html>`, - new URLsMatchingSet() - ) + const result = new Inspector(new URLsMatchingSet(), new Config()) + .extractURLs( + `<html> + <a href="1">Text</a> + <script src="2">Text</script> + <link href="3"></link> + <img src="4">Text</img> + </html>` + ) expect(result).to.eql(new Set(["1", "2", "3", "4"])) }); it("does not match unless rule supplied", () => { - const result = extractURLs( - `<html> - <img src="${url}">Text</img> - <link href="another-url"></link> - </html>`, - new URLsMatchingSet(URLMatchingRule.ImgSrc) - ) + const result = new Inspector(new URLsMatchingSet(URLMatchingRule.ImgSrc), new Config()) + .extractURLs( + `<html> + <img src="${url}">Text</img> + <link href="another-url"></link> + </html>` + ) expect(result).to.eql(new Set([url])) }); it("filters duplicates", () => { - const result = extractURLs( - `<html> - <img src="${url}">Text</img> - <script src="${url}">Text</script> - <link href="another-url"></link> - </html>`, - new URLsMatchingSet() - ) + const result = new Inspector(new URLsMatchingSet(), new Config()) + .extractURLs( + `<html> + <img src="${url}">Text</img> + <script src="${url}">Text</script> + <link href="another-url"></link> + </html>` + ) expect(result).to.eql(new Set([url, "another-url"])) }); it("fails for unknown rule", () => { - assert.throws(() => extractURLs(`<html><img src="${url}">Text</img></html>`, new URLsMatchingSet("error" as URLMatchingRule)), /unknown/) + assert.throws(() => new Inspector(new URLsMatchingSet("error" as URLMatchingRule), new Config()).extractURLs(`<html><img src="${url}">Text</img></html>`), /unknown/) }); }); diff --git a/test/process-url.ts b/test/process-url.ts index 698a438..0f1f852 100644 --- a/test/process-url.ts +++ b/test/process-url.ts @@ -1,6 +1,7 @@ -import { extractURLs, URLMatchingRule, URLsMatchingSet, processURL, CheckStatus, Config } from "../src/extractor" +import { Inspector, URLsMatchingSet, URLMatchingRule, Config } from "../src/inspector" import { expect, assert } from "chai"; import "mocha"; +import { ConsoleReporter } from "../src/report"; describe("processURL", function () { @@ -9,17 +10,17 @@ describe("processURL", function () { const validURL = new URL("https://dbogatov.org") it("processes non-recursive", async () => { - const result = await processURL(validURL, false, new Config(), new URLsMatchingSet()) + const result = await new Inspector(new URLsMatchingSet(), new Config()).processURL(validURL, false) - assert(result.count() == 1) + assert(result.count() > 1) // assert(result[0].url === validURL.href) // assert(result[0].status == CheckStatus.OK) }); it("processes recursive", async () => { - const result = await processURL(validURL, true, new Config(), new URLsMatchingSet()) + const result = await new Inspector(new URLsMatchingSet(), new Config()).processURL(validURL, true) - result.report() + result.report(new ConsoleReporter()) // assert(result.length == 1) // assert(result[0].url === validURL.href) diff --git a/tsconfig.json b/tsconfig.json index df4dc68..46b8ceb 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -8,7 +8,8 @@ "esModuleInterop": true, "skipLibCheck": true, "forceConsistentCasingInFileNames": true, - "noImplicitAny": true + "noImplicitAny": true, + "sourceMap": true }, "include": [ "src/**/*", -- GitLab