#!/bin/sh

# SPDX-License-Identifier: GPL-3.0-or-later
# SPDX-FileCopyrightText: 2024 Jonas Smedegaard

# Iterate through a host list: load the web page for each, pause while the
# operator gathers data manually in Firefox, then collect the artifacts.
#
# Requires sway, firefox-esr, jq
# Recommends (for postprocessing): heif-examples, poppler-utils

# break on error and on use of unset variables
set -eu

# top 1000 domains
# * go to ranking site
# * select "Denmark"
# * download
json_file="ranked_domains.json"

# Count currently open Firefox windows by walking the sway node tree.
# Outputs: number of firefox-esr windows on stdout.
firefox_windows() {
  swaymsg -t get_tree | jq '[recurse(.nodes[]? | .nodes[]?)] | map(select(.app_id == "firefox-esr")) | length'
}

# count already opened Firefox windows, so we can detect
# when the freshly opened window has been closed again
baseline_windows=$(firefox_windows)

# iterate through top domains and collect data about each
jq -c '.[]' "$json_file" | while read -r item; do
  domain=$(printf '%s\n' "$item" | jq -r '.domain')
  pos=$(printf '%s\n' "$item" | jq -r '.position')

  # skip if a screenshot already exists for this domain
  [ ! -e "$pos.png" ] || continue

  # load front page of www host at domain into Firefox
  firefox --new-window "https://www.$domain"

  # firefox(1) returns before sway registers the window, so first wait
  # for the window count to rise above the baseline; only then wait for
  # it to drop back (i.e. the operator closed the window).  Checking the
  # baseline right away would race and skip the page entirely.
  while [ "$(firefox_windows)" -le "$baseline_windows" ]; do
    sleep 1
  done
  while [ "$(firefox_windows)" -gt "$baseline_windows" ]; do
    sleep 1
  done

  # collect PNG screenshot
  # * use tool grimshot (drops the file in $HOME)
  find ~/ -mindepth 1 -maxdepth 1 -name '*.png' -exec mv -- '{}' "$pos.png" ';'

  # collect HAR network timing data
  # * Open debugger window: F12
  # * Select pane "Network"
  # * From rightmost pane, select "Save all as HAR"
  find ~/data -mindepth 1 -maxdepth 1 -name '*.har' -exec mv -- '{}' "$pos.har" ';'

  # collect PDF screenshot
  # * use plugin
  # NB: match only *.pdf — a bare '*' would grab every remaining file in
  # ~/data and clobber them all onto the single target "$pos.pdf"
  find ~/data -mindepth 1 -maxdepth 1 -name '*.pdf' -exec mv -- '{}' "$pos.pdf" ';'
done

# Post-processing to extract text strings and compress images:
#find -name '*.pdf' -type f -exec pdftotext -raw '{}' \;
#find -name '*.png' -type f -exec heif-enc -A '{}' \;