diff --git a/package-lock.json b/package-lock.json index ddacca2566975652df59c6690c4fdc5118f56b84..e668574a872c198d434446da897db3fbb4c15209 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "@dbogatov/broken-links-inspector", - "version": "0.1.1", + "version": "0.1.2", "lockfileVersion": 1, "requires": true, "dependencies": { @@ -425,6 +425,14 @@ "integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==", "dev": true }, + "axios": { + "version": "0.19.2", + "resolved": "https://registry.npmjs.org/axios/-/axios-0.19.2.tgz", + "integrity": "sha512-fjgm5MvRHLhx+osE2xoekY70AhARk3a6hkN+3Io1jc00jtquGvxYlKlsFUhmUET0V5te6CcZI7lcv2Ym61mjHA==", + "requires": { + "follow-redirects": "1.5.10" + } + }, "balanced-match": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", @@ -837,6 +845,29 @@ "is-buffer": "~2.0.3" } }, + "follow-redirects": { + "version": "1.5.10", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.5.10.tgz", + "integrity": "sha512-0V5l4Cizzvqt5D44aTXbFZz+FtyXV1vrDN6qrelxtfYQKW0KO0W2T/hkE8xvGa/540LkZlkaUjO4ailYTFtHVQ==", + "requires": { + "debug": "=3.1.0" + }, + "dependencies": { + "debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "requires": { + "ms": "2.0.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=" + } + } + }, "foreground-child": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-2.0.0.tgz", diff --git a/package.json b/package.json index ec7c5686d980f0f949467397b351464848a2f87c..766b5100e8af64eeba456e8dd1f5c0cb8c9ca7ed 100644 --- a/package.json +++ b/package.json @@ -8,8 +8,9 @@ }, "scripts": { "clean": "rm -rf .nyc_output coverage dist test-results.xml", - "test": "mocha --reporter mocha-junit-reporter -r ts-node/register test/**/*.ts", - "coverage": "nyc --reporter=html --reporter=cobertura --reporter=text -e .ts -n \"src/**/*.ts\" -x \"test/**/*.ts\" npm run test" + "test": "mocha --reporter spec -r ts-node/register test/**/*.ts", + "test-junit": "mocha --reporter mocha-junit-reporter -r ts-node/register test/**/*.ts", + "coverage": "nyc --reporter=html --reporter=cobertura --reporter=text -e .ts -n \"src/**/*.ts\" -x \"test/**/*.ts\" npm run test-junit" }, "publishConfig": { "@dbogatov:registry": "https://git.dbogatov.org/api/v4/projects/227/packages/npm/" @@ -30,6 +31,7 @@ "license": "MIT", "dependencies": { "@types/node": "^14.0.13", + "axios": "^0.19.2", "chalk": "^4.1.0", "commander": "^5.1.0", "htmlparser2": "^4.1.0" diff --git a/src/extractor.ts b/src/extractor.ts index 78b896d01070f807517ec831aa9c54a84ef6773c..743225d79d75464d3b3729495836cbd726f35a0c 100644 --- a/src/extractor.ts +++ b/src/extractor.ts @@ -1,15 +1,77 @@ import * as parser from "htmlparser2" +import axios, { AxiosError } from "axios" -export function extractURLs(html: string, matcher: URLsMatchingSet): string[] { +export async function processURL(originalUrl: URL, recursive: boolean, config: Config, matcher: URLsMatchingSet): Promise<Result> { - let urls: string[] = [] + let result = new Result(); + let urlsToCheck: [string, string?][] = [[originalUrl.href, undefined]] + + let processingRoutine = async (url: string, parent?: string) => { + + try { + url = parent ? new URL(url, parent).href : url + + if (result.isChecked(url) || config.ignoredExtensions.some(ext => url.startsWith(ext + ":"))) { + result.add({ url: url, status: CheckStatus.Skipped }, parent) + } else { + + const response = await axios.get(parent ? new URL(url, parent).href : url, { timeout: config.timeout }) + let html = response.data as string + + if (recursive && originalUrl.host == new URL(url).host) { + + let discoveredURLs = extractURLs(html, matcher) + + for (const discovered of discoveredURLs) { + urlsToCheck.push([discovered, url]) + } + } + + result.add({ url: url, status: CheckStatus.OK }, parent) + } + + } catch (exception) { + const error: AxiosError = exception; + + if (!error.response) { + result.add({ url: url, status: CheckStatus.GenericError }, parent) + } else { + if (config.acceptedCodes.some(code => code == error.response?.status)) { + result.add({ url: url, status: CheckStatus.OK }, parent) + } else { + result.add({ url: url, status: CheckStatus.NonSuccessCode, message: `${error.response.status}` }, parent) + } + } + } + + } + + let promises: Promise<void>[] = [] + + while (urlsToCheck.length > 0) { + + let [url, parent] = urlsToCheck.pop()! + + promises.push(processingRoutine(url, parent)) + + if (urlsToCheck.length == 0) { + await Promise.all(promises) + } + } + + return result +} + +export function extractURLs(html: string, matcher: URLsMatchingSet): Set<string> { + + let urls = new Set<string>(); let parserInstance = new parser.Parser( { onopentag(name, attributes) { const match = matcher.match(name, attributes); - if (match !== undefined && match !== "" && !match.startsWith("#")) { - urls.push(match) + if (match && match !== "" && !match.startsWith("#")) { + urls.add(match) } } }, @@ -18,7 +80,95 @@ export function extractURLs(html: string, matcher: URLsMatchingSet): string[] { parserInstance.write(html) parserInstance.end() - return urls.filter((value, index, self) => self.indexOf(value) === index) + return urls +} + +export class Config { + public acceptedCodes: number[] = [999] + public timeout: number = 2000 + public ignoredExtensions: string[] = ["mailto", "tel"] +} + +export class Result { + private pages = new Map<string, ResultItem[]>() + private checkedUrls = new Set<string>() + + public add(completedCheck: ResultItem, parent: string = "original request") { + // console.log(`${completedCheck.url} : ${completedCheck.status} ${completedCheck.message ? completedCheck.message : ""}`) // TODO + + if (this.pages.has(parent)) { + this.pages.get(parent)?.push(completedCheck) + } else { + this.pages.set(parent, [completedCheck]) + } + this.checkedUrls.add(completedCheck.url) + } + + public isChecked(url: string): boolean { + return this.checkedUrls.has(url) + } + + public count(): number { + let count = 0 + for (const page of this.pages.entries()) { + count += page[1].length + } + return count + } + + public report(): void { // TODO + + let allSkipped = 0 + let allOks = 0 + let allBroken = 0 + + for (const page of this.pages.entries()) { + console.log(page[0]) + + let skipped = 0 + let oks = 0 + let broken = 0 + + for (const check of page[1]) { + switch (check.status) { + case CheckStatus.OK: + oks++ + break + case CheckStatus.NonSuccessCode: + case CheckStatus.GenericError: + case CheckStatus.Timeout: + broken++ + break + case CheckStatus.Skipped: + skipped++ + break + } + + if (check.status != CheckStatus.Skipped) { + console.log(`\t${check.status} : ${check.url}`) + } + } + console.log(`\tOK: ${oks}, skipped: ${skipped}, broken: ${broken}`) + allOks += oks + allSkipped += skipped + allBroken += broken + } + console.log(`OK: ${allOks}, skipped: ${allSkipped}, broken: ${allBroken}`) + } +} + +export class ResultItem { + public url = "" + public status = CheckStatus.OK + public message?: string +} + +export enum CheckStatus { + OK = "OK", + Skipped = "SKIP", + Timeout = "TIMEOUT", + NonSuccessCode = "ERROR CODE", + GenericError = "UNKNOWN" } export enum URLMatchingRule { diff --git a/test/extract-urls.ts b/test/extract-urls.ts index 77eaf22397b9b47742804b5f2a65986e8ad96816..04b57f58a74613ae8f7d3b34e01de5f25336a119 100644 --- a/test/extract-urls.ts +++ b/test/extract-urls.ts @@ -8,22 +8,22 @@ describe("extractURLs", () => { it("works for <a href=...>", () => { const result = extractURLs(`<html><a href="${url}">Text</a></html>`, new URLsMatchingSet(URLMatchingRule.AHRef)) - expect(result).to.eql([url]) + expect(result).to.eql(new Set([url])) }); it("works for <script src=...>", () => { const result = extractURLs(`<html><script src="${url}">Text</script></html>`, new URLsMatchingSet(URLMatchingRule.ScriptSrc)) - expect(result).to.eql([url]) + expect(result).to.eql(new Set([url])) }); it("works for <link href=...>", () => { const result = extractURLs(`<html><link href="${url}"></link></html>`, new URLsMatchingSet(URLMatchingRule.LinkHref)) - expect(result).to.eql([url]) + expect(result).to.eql(new Set([url])) }); it("works for <img src=...>", () => { const result = extractURLs(`<html><img src="${url}">Text</img></html>`, new URLsMatchingSet(URLMatchingRule.ImgSrc)) - expect(result).to.eql([url]) + expect(result).to.eql(new Set([url])) }); it("works for many rules", () => { @@ -36,7 +36,7 @@ describe("extractURLs", () => { </html>`, new URLsMatchingSet() ) - expect(result).to.eql(["1", "2", "3", "4"]) + expect(result).to.eql(new Set(["1", "2", "3", "4"])) }); it("does not match unless rule supplied", () => { @@ -47,7 +47,7 @@ describe("extractURLs", () => { </html>`, new URLsMatchingSet(URLMatchingRule.ImgSrc) ) - expect(result).to.eql([url]) + expect(result).to.eql(new Set([url])) }); it("filters duplicates", () => { @@ -59,7 +59,7 @@ describe("extractURLs", () => { </html>`, new URLsMatchingSet() ) - expect(result).to.eql([url, "another-url"]) + expect(result).to.eql(new Set([url, "another-url"])) }); it("fails for unknown rule", () => { diff --git a/test/process-url.ts b/test/process-url.ts new file mode 100644 index 0000000000000000000000000000000000000000..698a438d55dfee75576f2fd1707dd617fb08d689 --- /dev/null +++ b/test/process-url.ts @@ -0,0 +1,29 @@ +import { extractURLs, URLMatchingRule, URLsMatchingSet, processURL, CheckStatus, Config } from "../src/extractor" +import { expect, assert } from "chai"; +import "mocha"; + +describe("processURL", function () { + + this.timeout(50_000); + + const validURL = new URL("https://dbogatov.org") + + it("processes non-recursive", async () => { + const result = await processURL(validURL, false, new Config(), new URLsMatchingSet()) + + assert(result.count() == 1) + // assert(result[0].url === validURL.href) + // assert(result[0].status == CheckStatus.OK) + }); + + it("processes recursive", async () => { + const result = await processURL(validURL, true, new Config(), new URLsMatchingSet()) + + result.report() + + // assert(result.length == 1) + // assert(result[0].url === validURL.href) + // assert(result[0].status == CheckStatus.OK) + }); + +});