Commit 0e1c3010 authored by Dmytro Bogatov

Merge branch '1-build-url-graph' into 'master'

Resolve "Build URL graph"

Closes #1

See merge request !1
parents f95fed25 a21a79d4
Pipeline #7062 passed
@@ -44,7 +44,8 @@ release:
npm publish
fi
rules:
- allow_failure: true
- if: '$CI_COMMIT_REF_NAME == "master"'
when: never
allow_failure: true
tags:
- docker
{
"name": "@dbogatov/broken-links-inspector",
"version": "0.1.1",
"version": "0.1.2",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
@@ -425,6 +425,14 @@
"integrity": "sha512-jgsaNduz+ndvGyFt3uSuWqvy4lCnIJiovtouQN5JZHOKCS2QuhEdbcQHFhVksz2N2U9hXJo8odG7ETyWlEeuDw==",
"dev": true
},
"axios": {
"version": "0.19.2",
"resolved": "https://registry.npmjs.org/axios/-/axios-0.19.2.tgz",
"integrity": "sha512-fjgm5MvRHLhx+osE2xoekY70AhARk3a6hkN+3Io1jc00jtquGvxYlKlsFUhmUET0V5te6CcZI7lcv2Ym61mjHA==",
"requires": {
"follow-redirects": "1.5.10"
}
},
"balanced-match": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz",
@@ -837,6 +845,29 @@
"is-buffer": "~2.0.3"
}
},
"follow-redirects": {
"version": "1.5.10",
"resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.5.10.tgz",
"integrity": "sha512-0V5l4Cizzvqt5D44aTXbFZz+FtyXV1vrDN6qrelxtfYQKW0KO0W2T/hkE8xvGa/540LkZlkaUjO4ailYTFtHVQ==",
"requires": {
"debug": "=3.1.0"
},
"dependencies": {
"debug": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz",
"integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==",
"requires": {
"ms": "2.0.0"
}
},
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
}
}
},
"foreground-child": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-2.0.0.tgz",
......
@@ -8,8 +8,9 @@
},
"scripts": {
"clean": "rm -rf .nyc_output coverage dist test-results.xml",
"test": "mocha --reporter mocha-junit-reporter -r ts-node/register test/**/*.ts",
"coverage": "nyc --reporter=html --reporter=cobertura --reporter=text -e .ts -n \"src/**/*.ts\" -x \"test/**/*.ts\" npm run test"
"test": "mocha --reporter spec -r ts-node/register test/**/*.ts",
"test-junit": "mocha --reporter mocha-junit-reporter -r ts-node/register test/**/*.ts",
"coverage": "nyc --reporter=html --reporter=cobertura --reporter=text -e .ts -n \"src/**/*.ts\" -x \"test/**/*.ts\" npm run test-junit"
},
"publishConfig": {
"@dbogatov:registry": "https://git.dbogatov.org/api/v4/projects/227/packages/npm/"
@@ -30,6 +31,7 @@
"license": "MIT",
"dependencies": {
"@types/node": "^14.0.13",
"axios": "^0.19.2",
"chalk": "^4.1.0",
"commander": "^5.1.0",
"htmlparser2": "^4.1.0"
......
import * as parser from "htmlparser2"
import axios, { AxiosError } from "axios"
export function extractURLs(html: string, matcher: URLsMatchingSet): string[] {
export async function processURL(originalUrl: URL, recursive: boolean, config: Config, matcher: URLsMatchingSet): Promise<Result> {
let urls: string[] = []
let result = new Result();
let urlsToCheck: [string, string?][] = [[originalUrl.href, undefined]]
let processingRoutine = async (url: string, parent?: string) => {
try {
url = parent ? new URL(url, parent).href : url
if (result.isChecked(url) || config.ignoredExtensions.some(ext => url.startsWith(ext + ":"))) {
result.add({ url: url, status: CheckStatus.Skipped }, parent)
} else {
const response = await axios.get(parent ? new URL(url, parent).href : url, { timeout: config.timeout })
let html = response.data as string
if (recursive && originalUrl.host == new URL(url).host) {
let discoveredURLs = extractURLs(html, matcher)
for (const discovered of discoveredURLs) {
urlsToCheck.push([discovered, url])
}
}
result.add({ url: url, status: CheckStatus.OK }, parent)
}
} catch (exception) {
const error: AxiosError = exception;
if (!error.response) {
result.add({ url: url, status: CheckStatus.GenericError }, parent)
} else {
if (config.acceptedCodes.some(code => code == error.response?.status)) {
result.add({ url: url, status: CheckStatus.OK }, parent)
} else {
result.add({ url: url, status: CheckStatus.NonSuccessCode, message: `${error.response.status}` }, parent)
}
}
}
}
let promises: Promise<void>[] = []
while (urlsToCheck.length > 0) {
let [url, parent] = urlsToCheck.pop()!
promises.push(processingRoutine(url, parent))
if (urlsToCheck.length == 0) {
await Promise.all(promises)
}
}
return result
}
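// Illustrative usage sketch, not part of this diff: calling processURL the way the
// tests below exercise it; "https://example.org" is a placeholder URL.
async function exampleProcess(): Promise<void> {
	const result = await processURL(new URL("https://example.org"), true, new Config(), new URLsMatchingSet())
	console.log(`checked ${result.count()} URLs`)
	result.report()
}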
export function extractURLs(html: string, matcher: URLsMatchingSet): Set<string> {
let urls = new Set<string>();
let parserInstance = new parser.Parser(
{
onopentag(name, attributes) {
const match = matcher.match(name, attributes);
if (match !== undefined && match !== "" && !match.startsWith("#")) {
urls.push(match)
if (match && match !== "" && !match.startsWith("#")) {
urls.add(match)
}
}
},
@@ -18,7 +80,95 @@ export function extractURLs(html: string, matcher: URLsMatchingSet): string[] {
parserInstance.write(html)
parserInstance.end()
return urls.filter((value, index, self) => self.indexOf(value) === index)
return urls
}
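// Illustrative sketch, not part of this diff: extracting URLs from an inline HTML
// snippet with the default matching rules, as the extractURLs tests do.
function exampleExtract(): Set<string> {
	// Returns a Set containing "https://example.org" and "logo.png"
	return extractURLs(
		`<html><a href="https://example.org">Text</a><img src="logo.png"></html>`,
		new URLsMatchingSet()
	)
}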
export class Config {
public acceptedCodes: number[] = [999]
public timeout: number = 2000
public ignoredExtensions: string[] = ["mailto", "tel"]
}
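// Illustrative sketch, not part of this diff: the defaults above can be tuned per
// run; the specific values here are made up for the example.
function exampleConfig(): Config {
	const config = new Config()
	config.timeout = 5000                              // wait up to 5 seconds per request
	config.acceptedCodes = [999, 403]                  // status codes to treat as OK
	config.ignoredExtensions = ["mailto", "tel", "javascript"]
	return config
}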
export class Result {
private pages = new Map<string, ResultItem[]>()
private checkedUrls = new Set<string>()
public add(completedCheck: ResultItem, parent: string = "original request") {
// console.log(`${completedCheck.url} : ${completedCheck.status} ${completedCheck.message ? completedCheck.message : ""}`) // TODO
if (this.pages.has(parent)) {
this.pages.get(parent)?.push(completedCheck)
} else {
this.pages.set(parent, [completedCheck])
}
this.checkedUrls.add(completedCheck.url)
}
public isChecked(url: string): boolean {
return this.checkedUrls.has(url)
}
public count(): number {
let count = 0
for (const page of this.pages.entries()) {
count += page[1].length
}
return count
}
public report(): void { // TODO
let allSkipped = 0
let allOks = 0
let allBroken = 0
for (const page of this.pages.entries()) {
console.log(page[0])
let skipped = 0
let oks = 0
let broken = 0
for (const check of page[1]) {
switch (check.status) {
case CheckStatus.OK:
oks++
break
case CheckStatus.NonSuccessCode:
case CheckStatus.GenericError:
case CheckStatus.Timeout:
broken++
break
case CheckStatus.Skipped:
skipped++
break
}
if (check.status != CheckStatus.Skipped) {
console.log(`\t${check.status} : ${check.url}`)
}
}
console.log(`\tOK: ${oks}, skipped: ${skipped}, broken: ${broken}`)
allOks += oks
allSkipped += skipped
allBroken += broken
}
console.log(`OK: ${allOks}, skipped: ${allSkipped}, broken: ${allBroken}`)
}
}
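// Illustrative sketch, not part of this diff: Result groups checks by the parent page
// they were discovered on; omitting the parent files them under "original request".
function exampleReport(): void {
	const result = new Result()
	result.add({ url: "https://example.org", status: CheckStatus.OK })
	result.add({ url: "https://example.org/broken", status: CheckStatus.NonSuccessCode, message: "404" }, "https://example.org")
	result.report() // prints per-page OK / skipped / broken counts
}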
export class ResultItem {
public url = ""
public status = CheckStatus.OK
public message?: string
}
export enum CheckStatus {
OK = "OK",
Skipped = "SKIP",
Timeout = "TIMEOUT",
NonSuccessCode = "ERROR CODE",
GenericError = "UNKNOWN"
}
export enum URLMatchingRule {
......
@@ -8,22 +8,22 @@ describe("extractURLs", () => {
it("works for <a href=...>", () => {
const result = extractURLs(`<html><a href="${url}">Text</a></html>`, new URLsMatchingSet(URLMatchingRule.AHRef))
expect(result).to.eql([url])
expect(result).to.eql(new Set([url]))
});
it("works for <script src=...>", () => {
const result = extractURLs(`<html><script src="${url}">Text</script></html>`, new URLsMatchingSet(URLMatchingRule.ScriptSrc))
expect(result).to.eql([url])
expect(result).to.eql(new Set([url]))
});
it("works for <link href=...>", () => {
const result = extractURLs(`<html><link href="${url}"></link></html>`, new URLsMatchingSet(URLMatchingRule.LinkHref))
expect(result).to.eql([url])
expect(result).to.eql(new Set([url]))
});
it("works for <img src=...>", () => {
const result = extractURLs(`<html><img src="${url}">Text</img></html>`, new URLsMatchingSet(URLMatchingRule.ImgSrc))
expect(result).to.eql([url])
expect(result).to.eql(new Set([url]))
});
it("works for many rules", () => {
@@ -36,7 +36,7 @@ describe("extractURLs", () => {
</html>`,
new URLsMatchingSet()
)
expect(result).to.eql(["1", "2", "3", "4"])
expect(result).to.eql(new Set(["1", "2", "3", "4"]))
});
it("does not match unless rule supplied", () => {
@@ -47,7 +47,7 @@
</html>`,
new URLsMatchingSet(URLMatchingRule.ImgSrc)
)
expect(result).to.eql([url])
expect(result).to.eql(new Set([url]))
});
it("filters duplicates", () => {
@@ -59,7 +59,7 @@
</html>`,
new URLsMatchingSet()
)
expect(result).to.eql([url, "another-url"])
expect(result).to.eql(new Set([url, "another-url"]))
});
it("fails for unknown rule", () => {
......
import { extractURLs, URLMatchingRule, URLsMatchingSet, processURL, CheckStatus, Config } from "../src/extractor"
import { expect, assert } from "chai";
import "mocha";
describe("processURL", function () {
this.timeout(50_000);
const validURL = new URL("https://dbogatov.org")
it("processes non-recursive", async () => {
const result = await processURL(validURL, false, new Config(), new URLsMatchingSet())
assert(result.count() == 1)
// assert(result[0].url === validURL.href)
// assert(result[0].status == CheckStatus.OK)
});
it("processes recursive", async () => {
const result = await processURL(validURL, true, new Config(), new URLsMatchingSet())
result.report()
// assert(result.length == 1)
// assert(result[0].url === validURL.href)
// assert(result[0].status == CheckStatus.OK)
});
});