Add basic single URL scraping

2023-09-19 14:43:42 +01:00
commit 2cd5440660
1 changed files with 30 additions and 0 deletions
--- a/scrape-all.sh
+++ b/scrape-all.sh
@@ -0,0 +1,30 @@
 #!/bin/bash
 #######################################
 # Download a page and store its body in the global `cache` variable.
 # Globals:   cache (written) - response body on success, empty on failure
 # Arguments: $1 - URL to fetch
 # Outputs:   diagnostics on stderr when the fetch cannot be performed
 # Returns:   0 on success, 2 when no URL is given, otherwise curl's exit code
 #######################################
 update_cache() {
 	if [[ -z "${1:-}" ]]; then
 		printf 'update_cache: missing URL argument\n' >&2
 		return 2
 	fi
 	local status=0
 	# -f: treat HTTP errors (>= 400) as failures instead of caching the error page
 	# -s/-S: no progress meter, but still report errors on stderr
 	# -L: follow redirects so a moved page is still fetched
 	cache=$(curl -fsSL "$1") || status=$?
 	if (( status != 0 )); then
 		# Never leave a partial or stale body behind for the extractors to parse.
 		cache=''
 		printf 'update_cache: failed to fetch %s (curl exit %d)\n' "$1" "$status" >&2
 	fi
 	return "$status"
 }
 #######################################
 # Print candidate ingredient names found in an HTML document.
 # Two kinds of lines (after `htmlq -p` formatting) are reported:
 #   1. lines containing a styled link to a /glossary/ page -> the link text
 #   2. lines containing at least two `<!-- -->` markers    -> the text
 #      between the first and second marker
 # Globals:   cache (read) - used when no argument is given
 # Arguments: $1 - HTML to inspect (optional; defaults to $cache)
 # Outputs:   one extracted string per matching line on stdout
 # Returns:   non-zero if htmlq fails, otherwise the status of the last pipeline
 #######################################
 get_main_ingredients() {
 	local data=${1:-${cache-}}
 	local pretty
 	# Format the document once and reuse it for both extractions.
 	pretty=$(printf '%s\n' "$data" | htmlq -p) || return

 	# Glossary links: match only up to the end of the opening <a ...> tag so a
 	# later tag on the same line cannot swallow the link text.
 	printf '%s\n' "$pretty" \
 		| grep '<a class="link link--styled" data-component="Link" href="/glossary/[^>]*">.*</a>' \
 		| sed -e 's|^.*<a class="link link--styled" data-component="Link" href="/glossary/[^>]*">||' \
 			-e 's|</a>.*||'

 	# Marker-delimited text: split on the marker itself rather than on a
 	# substitute character, so text containing '@' is not truncated.
 	printf '%s\n' "$pretty" \
 		| grep '<!-- -->.*<!-- -->' \
 		| awk -F'<!-- -->' '{print $2}'
 }
 #######################################
 # Print the page's JSON-LD schema, formatted by jq.
 # Takes the line that follows the last
 # `<script data-testid="page-schema" type="application/ld+json">` line in
 # the `htmlq -p` output, strips the closing </script> tag and pipes it to jq.
 # Globals:   cache (read) - used when no argument is given
 # Arguments: $1 - HTML to inspect (optional; defaults to $cache)
 # Outputs:   formatted JSON on stdout
 # Returns:   exit status of jq
 #######################################
 get_json() {
 	local data=${1:-${cache-}}
 	# The explicit '.' filter avoids relying on jq's implicit default
 	# program when invoked with no arguments.
 	printf '%s\n' "$data" \
 		| htmlq -p \
 		| grep -A1 '<script data-testid="page-schema" type="application/ld+json">' \
 		| tail -n1 \
 		| sed 's/<\/script>//' \
 		| jq .
 }
 # Fetch the sample recipe page, then print its JSON-LD schema followed by the
 # extracted ingredient names. Stop early if the download fails so the
 # extractors are not run against missing data.
 if ! update_cache "https://www.bbcgoodfood.com/recipes/vegan-banana-bread"; then
 	printf 'scrape-all: could not download recipe page\n' >&2
 	exit 1
 fi
 get_json
 get_main_ingredients