Web Scraping Techniques | JavaScript | Cheatsheet


Capture All Link URLs

Array.from(document.links).forEach(({ href }) => console.log(href));
Array.from(document.links, ({ href }) => href).forEach(console.log);
[...document.getElementsByTagName('a')].forEach(a => console.log(a.href));
Array.from(document.getElementsByTagName('a'), a => a.href).forEach(url => console.log(url));
const anchors = Array.from(document.getElementsByTagName('a'));
for (const anchor of anchors) {
    console.log(anchor.href);
}
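
Note that the href property always returns the fully resolved absolute URL; to read the raw value as written in the markup, use getAttribute instead:

Array.from(document.getElementsByTagName('a')).forEach(a => {
  console.log(a.getAttribute('href')); // raw attribute, e.g. "/about"
  console.log(a.href);                 // resolved URL, e.g. "https://example.com/about"
});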

Capture Emails Using a Regular Expression

const regex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
const html = document.documentElement.innerHTML;
let match;
while ((match = regex.exec(html))) {
  console.log(match[0]);
}
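
Pages often repeat the same address; feeding the matches into a Set removes duplicates. A minimal variant of the same pattern using String.prototype.match:

const emailRegex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
const emails = new Set(document.documentElement.innerHTML.match(emailRegex) || []);
emails.forEach(email => console.log(email));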

Capture All Images

Array.from(document.images).forEach(({ src }) => console.log(src));
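
Lazy-loaded images often keep their real URL in a data attribute until they scroll into view. data-src is a common but non-standard convention, so this selector is an assumption about the page:

document.querySelectorAll('img[data-src]').forEach(img => {
  console.log(img.dataset.src); // assumes the page uses the data-src convention
});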

Capture Stylesheets

Array.from(document.styleSheets).forEach(({ href }) => console.log(href));

Capture Internal Links

Array.from(document.links).filter(a => a.hostname === location.hostname).forEach(({ href }) => console.log(href));

Capture External Links

Array.from(document.links).filter(a => a.hostname !== location.hostname).forEach(({ href }) => console.log(href));
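
The same hostname comparison can feed a per-host tally, which gives a quick overview of where a page links to; a minimal sketch using a Map:

const counts = new Map();
for (const { hostname } of document.links) {
  counts.set(hostname, (counts.get(hostname) || 0) + 1);
}
counts.forEach((count, host) => console.log(host, count));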

Capture Unique URLs

const uniqueURLs = new Set(Array.from(document.links).map(({ href }) => href));
uniqueURLs.forEach(url => console.log(url));

Capture PDF Links

Array.from(document.links).filter(a => a.href.endsWith('.pdf')).forEach(({ href }) => console.log(href));

Capture Download Links

Array.from(document.querySelectorAll('a[download]')).forEach(({ href }) => console.log(href));

Capture Email (mailto:) Links

Array.from(document.querySelectorAll('a[href^="mailto:"]')).forEach(({ href }) => console.log(href));

Capture Telephone (tel:) Links

Array.from(document.querySelectorAll('a[href^="tel:"]')).forEach(({ href }) => console.log(href));

Capture Links Containing Specific Text

// Replace 'text' with the phrase to match.
Array.from(document.links).filter(a => a.innerText.includes('text')).forEach(({ href }) => console.log(href));

Capture Fragment (Same-Page) Links

Array.from(document.querySelectorAll('a[href^="#"]')).forEach(({ href }) => console.log(href));

Capture Chrome Tab URLs

// Runs only inside a Chrome extension (e.g. a background script) that has the "tabs" permission.
chrome.tabs.query({}, function(tabs) {
  tabs.forEach(tab => console.log(tab.url));
});
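
In Manifest V3, chrome.tabs.query also returns a Promise when the callback is omitted, so the same capture works with await inside an async context:

const tabs = await chrome.tabs.query({});
tabs.forEach(tab => console.log(tab.url));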

Capture Iframe Sources

Array.from(document.querySelectorAll('iframe')).forEach(({ src }) => console.log(src));
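
Same-origin iframes also expose their document, so their links can be captured too; cross-origin frames block access (contentDocument is null or throws), hence the guard:

Array.from(document.querySelectorAll('iframe')).forEach(frame => {
  try {
    // Only same-origin iframes allow reading contentDocument.
    const links = frame.contentDocument.querySelectorAll('a');
    links.forEach(({ href }) => console.log(href));
  } catch (e) {
    console.warn('Cross-origin iframe skipped:', frame.src);
  }
});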

Advanced Techniques

Capture All Image URLs to a New Tab

const images = Array.from(document.images);
const imageUrls = images.map((image) => image.src);
const anchorTags = imageUrls.map((url) => `<a href="${url}" target="_blank">${url}</a>`);
const newTab = window.open(); // may be null if the browser blocks the popup
if (newTab) {
  newTab.document.write('<ul>' + anchorTags.map((tag) => `<li>${tag}</li>`).join('') + '</ul>');
}

Capture All Images to a New Tab with Fixed-Size Previews

const images = Array.from(document.images);
const imageUrls = images.map((image) => image.src);
const anchorTags = imageUrls.map((url) => `<a href="${url}" target="_blank"><img src="${url}" width="50" height="50"></a>`);
const newTab = window.open();
newTab.document.write('<ul style="list-style-type:none; padding: 0;">' + anchorTags.map((tag) => `<li>${tag}</li>`).join('') + '</ul>');

Capture All Images to a New Tab with Default-Size Previews

const images = Array.from(document.images);
const imageUrls = images.map((image) => image.src);
const anchorTags = imageUrls.map((url) => `<a href="${url}" target="_blank"><img src="${url}"></a>`);
const newTab = window.open();
newTab.document.write('<ul style="list-style-type:none; padding: 0;">' + anchorTags.map((tag) => `<li>${tag}</li>`).join('') + '</ul>');
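
document.write on a freshly opened tab works, but building the list with DOM APIs avoids string-escaping pitfalls; a sketch of the same idea:

const newTab = window.open();
if (newTab) {
  const list = newTab.document.createElement('ul');
  list.style.cssText = 'list-style-type:none; padding: 0;';
  for (const { src } of document.images) {
    const item = newTab.document.createElement('li');
    const link = newTab.document.createElement('a');
    link.href = src;
    link.target = '_blank';
    link.textContent = src;
    item.append(link);
    list.append(item);
  }
  newTab.document.body.append(list);
}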

Capture Sources of Elements with src Attributes

// '[src]' matches every element carrying a src attribute; narrow the selector
// (e.g. 'img, script, iframe') to target specific tags instead.
const elementsWithSrc = document.querySelectorAll('[src]');
for (const element of elementsWithSrc) {
  const src = element.getAttribute('src');
  if (src) {
    console.log(src);
  }
}

Capture Elements with XPath

const xpathResult = document.evaluate('//a', document, null, XPathResult.ANY_TYPE, null);
let node = xpathResult.iterateNext();
while (node) {
  console.log(node.href);
  node = xpathResult.iterateNext();
}
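
An ANY_TYPE iterator is invalidated if the document changes mid-iteration; a snapshot result is safe to walk even while modifying the DOM (same //a expression):

const snapshot = document.evaluate('//a', document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
for (let i = 0; i < snapshot.snapshotLength; i++) {
  console.log(snapshot.snapshotItem(i).href);
}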

Web Crawler with Elapsed Time

const crawledUrls = new Set();
const pendingUrls = [window.location.href];

async function crawl() {
  const startTime = Date.now();

  while (pendingUrls.length) {
    const url = pendingUrls.pop();
    if (!crawledUrls.has(url)) {
      console.log(`Crawling ${url}`);
      try {
        // Fetching cross-origin pages will typically fail due to CORS.
        const response = await fetch(url);
        const text = await response.text();
        const doc = new DOMParser().parseFromString(text, 'text/html');
        for (const a of doc.getElementsByTagName('a')) {
          // Documents created by DOMParser inherit the current page's base URL,
          // so resolve relative hrefs against the fetched page's URL instead.
          const raw = a.getAttribute('href');
          if (!raw) continue;
          let href;
          try {
            href = new URL(raw, url).href;
          } catch {
            continue; // skip malformed URLs
          }
          if (!crawledUrls.has(href) && !pendingUrls.includes(href)) {
            pendingUrls.push(href);
          }
        }
      } catch (e) {
        console.error(`Failed to crawl "${url}": ${e}`);
      }
      crawledUrls.add(url);
    }
  }

  const elapsedTime = Date.now() - startTime;
  console.log('Finished crawling', crawledUrls.size, 'URLs');
  console.log('Elapsed time:', elapsedTime, 'ms');
}

crawl();
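
Left unbounded, the crawler above will follow links off-site and run until every reachable URL is exhausted. A sketch with two common guards, staying on the current origin and capping the page count (the maxPages value is arbitrary):

const visited = new Set();
const queue = [window.location.href];
const maxPages = 50; // arbitrary cap for this sketch

async function crawlSameOrigin() {
  while (queue.length && visited.size < maxPages) {
    const url = queue.pop();
    if (visited.has(url)) continue;
    visited.add(url);
    try {
      const text = await (await fetch(url)).text();
      const doc = new DOMParser().parseFromString(text, 'text/html');
      for (const a of doc.getElementsByTagName('a')) {
        const raw = a.getAttribute('href');
        if (!raw) continue;
        let next;
        try { next = new URL(raw, url); } catch { continue; }
        // Same-origin guard: skip links that leave the current site.
        if (next.origin === location.origin && !visited.has(next.href)) {
          queue.push(next.href);
        }
      }
    } catch (e) {
      console.error(`Failed to crawl "${url}": ${e}`);
    }
  }
  console.log('Crawled', visited.size, 'same-origin URLs');
}

crawlSameOrigin();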