Add basic single URL scraping

This commit is contained in:
CactiChameleon9 2023-09-19 14:43:42 +01:00
commit 2cd5440660

30
scrape-all.sh Normal file
View File

@@ -0,0 +1,30 @@
#!/bin/bash
#######################################
# Download a page and store its body in the global `cache` variable.
# Globals:   cache (written, but only when the download succeeds)
# Arguments: $1 - URL to fetch
# Outputs:   error messages on stderr
# Returns:   0 on success, 2 if no URL was given, otherwise curl's exit status
#######################################
update_cache() {
  if [[ -z "${1:-}" ]]; then
    printf 'update_cache: missing URL argument\n' >&2
    return 2
  fi

  # Fetch into a local first so a failed download never clobbers a good cache.
  # Declared separately from the assignment so `local` does not mask curl's
  # exit status.
  local body
  # --fail:       HTTP 4xx/5xx become a non-zero exit instead of an error page
  #               being cached as if it were the requested document.
  # --location:   follow redirects to the final page.
  # --show-error: keep --silent's quiet progress but still report failures.
  body=$(curl --silent --show-error --fail --location "$1") || return

  cache=$body
}
#######################################
# Print ingredient names extracted from a page's HTML, one per line.
# Two passes are made over the pretty-printed HTML:
#   1. the link text of styled <a> elements whose href starts with /glossary/
#   2. the text between the first two `<!-- -->` markers on a line
# Globals:   cache (read when no argument is supplied)
# Arguments: $1 - optional HTML to parse; defaults to $cache
# Outputs:   extracted names on stdout (pass 1 results first, then pass 2)
# Returns:   htmlq's status if pretty-printing fails, otherwise 0
#######################################
get_main_ingredients() {
  local data=${1:-$cache}

  # Pretty-print once and reuse the result for both passes.
  local pretty
  pretty=$(printf '%s\n' "$data" | htmlq -p) || return

  # Pass 1: glossary links. `[^"]*` keeps the href match inside its own
  # attribute; a greedy `.*` would run on to the last `">` on the line and
  # swallow the link text when another tag follows the link.
  printf '%s\n' "$pretty" \
    | grep '<a class="link link--styled" data-component="Link" href="/glossary/.*">.*</a>' \
    | sed -e 's/^.*<a class="link link--styled" data-component="Link" href="[^"]*">//' \
          -e 's/<\/a>.*//'

  # Pass 2: split directly on the comment marker instead of rewriting it to a
  # placeholder character, so text that itself contains `@` is not cut short.
  printf '%s\n' "$pretty" \
    | grep '<!-- -->.*<!-- -->' \
    | awk -F'<!-- -->' '{print $2}'
}
#######################################
# Print the JSON held in the page's "page-schema" ld+json <script> element.
# Globals:   cache (read when no argument is supplied)
# Arguments: $1 - optional HTML to parse; defaults to $cache
# Outputs:   the JSON, passed through jq, on stdout
#######################################
get_json() {
  local html=${1:-$cache}

  # grep -A1 emits the opening <script> line plus the line after it; tail
  # keeps only the last line of that output, sed drops the closing tag, and
  # jq formats what remains.
  printf '%s\n' "$html" \
    | htmlq -p \
    | grep -A1 '<script data-testid="page-schema" type="application/ld+json">' \
    | tail -n1 \
    | sed 's/<\/script>//' \
    | jq
}
# Script entry point: fetch one recipe page, then print its JSON schema
# followed by its main ingredients.
# Usage: scrape-all.sh [URL]   (URL defaults to the vegan banana bread recipe)
url=${1:-https://www.bbcgoodfood.com/recipes/vegan-banana-bread}

# Stop here if the download fails; running the extractors against an empty
# cache would only print nothing.
if ! update_cache "$url"; then
  printf 'scrape-all: failed to fetch %s\n' "$url" >&2
  exit 1
fi

get_json
get_main_ingredients