Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 63 additions & 95 deletions lib/node-linkchecker.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,19 @@
'use strict';

// Pseudo-constants:
var DEFAULT_OPTIONS = {
const DEFAULT_OPTIONS = {
schemes: ["http:", "https:"],
userAgent: "node-linchecker",
robotExclusion: true,
fragments: true
};

var ua = require("superagent"),
whacko = require("whacko"),
const ua = require("superagent"),
{JSDOM} = require('jsdom'),
url = require("url"),
chalk = require("chalk"),
Promise = require('promise'),
options = JSON.parse(JSON.stringify(DEFAULT_OPTIONS)),
list,
result;
Promise = require('promise');

var linksAttr = {
const linksAttr = {
background: ['body'],
cite: ['blockquote', 'del', 'ins', 'q'],
data: ['object'],
Expand All @@ -29,72 +25,48 @@ var linksAttr = {
poster: ['video'],
pluginspage: ['embed'],
pluginurl: ['embed'],
src: ['audio', 'embed', 'frame', 'iframe', 'img', 'input', 'script', 'source', 'track', 'video'],
src: ['audio', 'embed', 'frame', 'iframe', 'img', 'input', 'script', 'source', 'track', 'video']
};

/**
 * Report whether a parsed URL uses one of the schemes listed in the
 * module-level `options.schemes` array.
 *
 * @param {Object} url - parsed URL object; only its `protocol` property is read.
 * @returns {boolean} true when `url.protocol` strictly equals an entry of `options.schemes`.
 */
function isSchemeAllowed(url) {
    let allowed = false;
    for (const candidate of options.schemes) {
        if (candidate === url.protocol) {
            allowed = true;
            break;
        }
    }
    return allowed;
}

/**
 * Comparator ordering two URL-like objects by their `href` property.
 *
 * @param {Object} a - first object; its `href` is compared.
 * @param {Object} b - second object; its `href` is compared.
 * @returns {number} -1 when a.href < b.href, 1 when a.href > b.href, otherwise 0.
 */
function sortURLs(a, b) {
    const left = a.href;
    const right = b.href;
    if (left < right) {
        return -1;
    }
    return left > right ? 1 : 0;
}

/**
 * Sort an array of URL objects by `href` and drop entries whose `href`
 * matches the entry just before them in the sorted order.
 *
 * The input array is sorted in place (Array.prototype.sort mutates it);
 * the de-duplicated result is returned as a new array.
 *
 * @param {Array<Object>} arr - objects exposing an `href` property.
 * @returns {Array<Object>} sorted entries with adjacent duplicates removed.
 */
function sortUniq(arr) {
    arr.sort(sortURLs);
    // After sorting, equal hrefs are adjacent, so comparing with the previous entry suffices.
    return arr.filter(function(entry, index) {
        return index === 0 || entry.href != arr[index - 1].href;
    });
}

function extract(baseURL, $) {
for (var attr in linksAttr) {
var elements = linksAttr[attr].map(function(tag) {return tag+'['+attr+']';}).join(',');
$(elements).each(function() {
if ($(this) !== undefined) {
var resolvedUrl = url.parse(url.resolve(baseURL, $(this).attr(attr)));
if (isSchemeAllowed(resolvedUrl)) {
if (resolvedUrl.hash === null) {
list.links.push(resolvedUrl);
} else {
list.fragments.push(resolvedUrl);
}
function extract(baseURL, doc, options) {
const list = {
links : new Set(),
fragments: new Set()
};
for (let attr in linksAttr) {
const elementSel = linksAttr[attr].map(tag => `${tag}[${attr}]`).join(',');
for (let el of doc.querySelectorAll(elementSel)) {
const resolvedUrl = url.parse(url.resolve(baseURL, el.getAttribute(attr)));
if (options.schemes.includes(resolvedUrl.protocol)) {
if (resolvedUrl.hash === null) {
list.links.add(resolvedUrl);
} else {
list.fragments.add(resolvedUrl);
}
}
});
}
}
return list;
}

function checkLink(link, method) {
function checkLink(link, method, options) {
var req = (method==='get') ? ua.get(link.href) : ua.head(link.href);
req.redirects(3);
const result = {
brokenLinks : []
};

return new Promise(function(resolve, reject) {
req.set("User-Agent", options.userAgent)
.on('error', function(err) {
reject(err);
})
.end(function(err, res) {
if (!res) {
result.brokenLinks.push({link: link.href, status: 'unknown'});
const status = res ? res.status : 'unknown';
if (status !== 200) {
result.brokenLinks.push({link: link.href, status});
}
else {
if (res.headers.location) { // redirect
// superagent doesn't follow the redirect when it's doing a HEAD
// https://github.com/visionmedia/superagent/issues/669
checkLink(link, 'get');
}
else if (res.status !== 200) {
result.brokenLinks.push({link: link.href, status: res.status});
}
}
resolve();
resolve(result);
});
});
}
Expand All @@ -106,14 +78,18 @@ function hashEscaper(hash) {
});
}

function checkFragmentsList(list) {
var fragmentsList = {};
list.forEach(function(link) {
var fragmentLessURL = link.protocol + '//' + link.host + link.pathname;
function checkFragmentsList(list, options) {
const fragmentsList = {};
const result = {
brokenFragments: []
};

for (const link of list) {
const fragmentLessURL = link.protocol + '//' + link.host + link.pathname;
if (!fragmentsList[fragmentLessURL])
fragmentsList[fragmentLessURL] = [];
fragmentsList[fragmentLessURL].push(link.hash);
});
}

var keys = Object.keys(fragmentsList);
return new Promise(function(resolve, reject) {
Expand All @@ -125,41 +101,25 @@ function checkFragmentsList(list) {
reject(err);
})
.end(function(err, res) {
var $ = whacko.load(res.text);
const dom = new JSDOM(res.text);
fragmentsList[keys[index]].forEach(function(hash) {
var $el = $(hashEscaper(hash).join(",")).first();
if (!$el.length) {
var el = dom.window.document.querySelector(hashEscaper(hash).join(","));
if (!el) {
result.brokenFragments.push({link: keys[index] + hash, status: res.status});
}
});
processLink(index + 1);
});
} else {
resolve();
resolve(result);
}
}
};
processLink(0);
});
}



exports.check = function(url, opts) {
options = JSON.parse(JSON.stringify(DEFAULT_OPTIONS));
if (opts) {
if (opts.hasOwnProperty("userAgent")) options.userAgent = opts.userAgent;
if (opts.hasOwnProperty("schemes")) options.schemes = opts.schemes;
if (opts.hasOwnProperty("robotExclusion")) options.robotExclusion = opts.robotExclusion;
if (opts.hasOwnProperty("fragments")) options.fragments = opts.fragments;
}
list = {
links : [],
fragments: []
},
result = {
brokenLinks : [],
brokenFragments: []
};
const options = {...DEFAULT_OPTIONS, ...opts};

return new Promise(function(resolve, reject) {
ua.get(url)
Expand All @@ -168,21 +128,29 @@ exports.check = function(url, opts) {
reject(err);
})
.end(function(err, res) {
var $ = whacko.load(res.text),
const dom = new JSDOM(res.text),
baseURL = (res.redirects.length > 0) ? res.redirects[res.redirects.length - 1] : url;
extract(baseURL, $);
const list = extract(baseURL, dom.window.document, options);
var p = [];
// links
sortUniq(list.links).forEach(function(link) {
p.push(checkLink(link));
});
for (const link of list.links) {
p.push(checkLink(link, 'head', options));
}

// fragments
if (options.fragments) {
p.push(checkFragmentsList(sortUniq(list.fragments)));
p.push(checkFragmentsList(list.fragments, options));
}
Promise.all(p).then(function() {
resolve(result);
Promise.all(p).then(function(results) {
const flatResults = results.reduce(
(a,b) =>
{ return {
brokenLinks: a.brokenLinks.concat(b.brokenLinks || []),
brokenFragments: a.brokenFragments.concat(b.brokenFragments || [])
};
},
{brokenLinks:[], brokenFragments: []});
resolve(flatResults);
});
});
});
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
},
"dependencies": {
"chalk": "^1.1.1",
"jsdom": "^11.6.0",
"promise": "^7.1.1",
"resolve-url": "^0.2.1",
"superagent": "^1.7.2",
"whacko": "^0.19.1"
"superagent": "^1.7.2"
},
"devDependencies": {
"chai": "3.5",
Expand Down