Web Scraping Techniques | JavaScript | Cheatsheet
Capture All Links
Array.from(document.links).forEach(({ href }) => console.log(href));
Capture All Links (Alternative)
Array.from(document.links, ({ href }) => href).forEach(url => console.log(url));
Capture Links Using getElementsByTagName
[...document.getElementsByTagName('a')].forEach(a => console.log(a.href));
Capture Links Using getElementsByTagName (Alternative)
Array.from(document.getElementsByTagName('a'), a => a.href).forEach(url => console.log(url));
Capture Links Using a for Loop
const anchors = Array.from(document.getElementsByTagName('a'));
for (const a of anchors) {
  console.log(a.href);
}
Capture Emails Using Regular Expression
const regex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
const html = document.documentElement.innerHTML;
let match;
while ((match = regex.exec(html))) {
  console.log(match[0]);
}
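Pages often repeat the same address; a variant (a sketch, using the same regex as above) that deduplicates matches with String.prototype.match and a Set:
const emailRegex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
// match() with the g flag returns every match at once, or null when there are none
const emails = new Set(document.documentElement.innerHTML.match(emailRegex) || []);
emails.forEach(email => console.log(email));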
Capture All Images
Array.from(document.images).forEach(({ src }) => console.log(src));
Capture Stylesheets
Array.from(document.styleSheets).forEach(({ href }) => console.log(href));
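Inline <style> sheets appear in document.styleSheets with href === null, so the snippet above may log null; a sketch that keeps only external stylesheets:
Array.from(document.styleSheets)
  .filter(({ href }) => href !== null)
  .forEach(({ href }) => console.log(href));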
Capture Internal Links
Array.from(document.links).filter(a => a.hostname === location.hostname).forEach(({ href }) => console.log(href));
Capture External Links
Array.from(document.links).filter(a => a.hostname !== location.hostname).forEach(({ href }) => console.log(href));
Capture Unique URLs
let uniqueURLs = new Set(Array.from(document.links).map(({ href }) => href));
uniqueURLs.forEach(url => console.log(url));
Capture PDF Links
Array.from(document.links).filter(a => a.href.endsWith('.pdf')).forEach(({ href }) => console.log(href));
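endsWith('.pdf') misses uppercase extensions and URLs with query strings; a sketch that checks the URL path case-insensitively instead:
Array.from(document.links)
  .filter(a => new URL(a.href).pathname.toLowerCase().endsWith('.pdf'))
  .forEach(({ href }) => console.log(href));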
Capture Download Links
Array.from(document.querySelectorAll('a[download]')).forEach(({ href }) => console.log(href));
Capture Mailto Links
Array.from(document.querySelectorAll('a[href^="mailto:"]')).forEach(({ href }) => console.log(href));
Capture Tel Links
Array.from(document.querySelectorAll('a[href^="tel:"]')).forEach(({ href }) => console.log(href));
Capture Links with Specific Text
Array.from(document.links).filter(a => a.innerText.includes('text')).forEach(({ href }) => console.log(href));
Capture Anchor Links
Array.from(document.querySelectorAll('a[href^="#"]')).forEach(({ href }) => console.log(href));
Capture Chrome Tabs URLs
chrome.tabs.query({}, function(tabs) {
  tabs.forEach(tab => console.log(tab.url));
});
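This only works inside a Chrome extension (e.g. the background service worker) whose manifest requests the "tabs" permission, not in an ordinary page console; in Manifest V3 the same call also returns a promise:
// Requires an extension context with the "tabs" permission
chrome.tabs.query({}).then(tabs => tabs.forEach(tab => console.log(tab.url)));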
Capture Iframe Sources
Array.from(document.querySelectorAll('iframe')).forEach(({ src }) => console.log(src));
Advanced Techniques
Capture All Image URLs to a Tab
const images = Array.from(document.images);
const imageUrls = images.map((image) => image.src);
const anchorTags = imageUrls.map((url) => `<a href="${url}" target="_blank">${url}</a>`);
window.open().document.write('<ul>' + anchorTags.map((tag) => `<li>${tag}</li>`).join('') + '</ul>');
Capture All Images to a Tab with Fixed-Size Previews
const images = Array.from(document.images);
const imageUrls = images.map((image) => image.src);
const anchorTags = imageUrls.map((url) => `<a href="${url}" target="_blank"><img src="${url}" width="50" height="50"></a>`);
const newTab = window.open();
newTab.document.write('<ul style="list-style-type:none; padding: 0;">' + anchorTags.map((tag) => `<li>${tag}</li>`).join('') + '</ul>');
Capture All Images to a Tab with Default-Size Previews
const images = Array.from(document.images);
const imageUrls = images.map((image) => image.src);
const anchorTags = imageUrls.map((url) => `<a href="${url}" target="_blank"><img src="${url}"></a>`);
const newTab = window.open();
newTab.document.write('<ul style="list-style-type:none; padding: 0;">' + anchorTags.map((tag) => `<li>${tag}</li>`).join('') + '</ul>');
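document.write is deprecated, and window.open may return null when a popup blocker intervenes; a sketch of the same preview list built with DOM APIs instead:
const newTab = window.open(); // may be null if blocked; run from a user gesture
const list = newTab.document.createElement('ul');
list.style.cssText = 'list-style-type:none; padding:0;';
Array.from(document.images).forEach(({ src }) => {
  const item = newTab.document.createElement('li');
  const link = newTab.document.createElement('a');
  link.href = src;
  link.target = '_blank';
  const img = newTab.document.createElement('img');
  img.src = src;
  link.append(img);
  item.append(link);
  list.append(item);
});
newTab.document.body.append(list);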
Capture Sources of Elements with src Attributes
var elementsWithSrc = document.querySelectorAll('img, script, iframe');
for (var i = 0; i < elementsWithSrc.length; i++) {
  var element = elementsWithSrc[i];
  var src = element.getAttribute('src');
  if (src) {
    console.log(src);
  }
}
Capture Elements with XPath
const xpathResult = document.evaluate('//a', document, null, XPathResult.ANY_TYPE, null);
let node = xpathResult.iterateNext();
while (node) {
  console.log(node.href);
  node = xpathResult.iterateNext();
}
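ANY_TYPE iterators are invalidated if the document mutates while you iterate; a sketch using a snapshot result type, which is safe to walk at leisure:
const snapshot = document.evaluate('//a', document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
for (let i = 0; i < snapshot.snapshotLength; i++) {
  console.log(snapshot.snapshotItem(i).href);
}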
Web Crawler with Elapsed Time
const crawledUrls = new Set();
const pendingUrls = [window.location.href];
async function crawl() {
  const startTime = Date.now();
  while (pendingUrls.length) {
    const url = pendingUrls.pop();
    if (!crawledUrls.has(url)) {
      console.log(`Crawling ${url}`);
      try {
        const response = await fetch(url);
        const text = await response.text();
        const doc = new DOMParser().parseFromString(text, 'text/html');
        for (const a of doc.getElementsByTagName('a')) {
          // Resolve relative links against the fetched page, not the current one:
          // DOMParser documents inherit this page's base URL
          const link = new URL(a.getAttribute('href') || '', url);
          link.hash = ''; // treat #fragment variants as the same page
          const href = link.href;
          // Stay on this site: cross-origin fetches are usually blocked by CORS
          if (link.origin === location.origin && !crawledUrls.has(href) && !pendingUrls.includes(href)) {
            pendingUrls.push(href);
          }
        }
      } catch (e) {
        console.error(`Failed to crawl "${url}": ${e}`);
      }
      crawledUrls.add(url);
    }
  }
  const elapsedTime = Date.now() - startTime;
  console.log('Finished crawling', crawledUrls.size, 'URLs');
  console.log('Elapsed time:', elapsedTime, 'ms');
}
crawl();
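Because fetch is subject to the same-origin policy, cross-origin pages without permissive CORS headers land in the catch block, which is why the snippet restricts itself to location.origin. For anything beyond a single site, crawl server-side and respect robots.txt.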