Finish scraper and scrape for vegetarian food
This commit is contained in:
parent
2cd5440660
commit
dd21ae23ed
6138
goodFoodData.csv
Normal file
6138
goodFoodData.csv
Normal file
File diff suppressed because it is too large
Load Diff
49106
goodFoodData.json
Normal file
49106
goodFoodData.json
Normal file
File diff suppressed because it is too large
Load Diff
101
scrape-all.sh
Normal file → Executable file
101
scrape-all.sh
Normal file → Executable file
#! /usr/bin/env nix-shell
#! nix-shell -i bash -p htmlq jq curl
# Scrape bbcgoodfood.com search results (title, url, rating, time, skill,
# main ingredients) into goodFoodData.json and goodFoodData.csv.
#
# Usage: ./scrape-all.sh '<search query-string fragment>'
#   e.g. ./scrape-all.sh 'diet=vegetarian'
#
# NOTE(review): the nix-shell -p list previously only provided htmlq, but the
# script also shells out to jq and curl on every request — added both so the
# environment is self-contained.

# Raw query-string fragment appended to every search API call.
search_args=$1

# Page size requested from the search API; also the divisor for the
# page-count ceiling division below.
recipies_per_search=500  # NOTE(review): "recipies" is a typo for "recipes"; kept as-is because every function reads this exact name
||||||
# Download the page at URL $1 and stash the response body in the global
# $cache, so later helpers can re-parse the same page without re-downloading.
update_cache() {
  cache="$(curl -s "$1")"
}
|
# Echo how many search pages are needed to cover every result.
# Queries the search API once with limit=1 just to read the total item
# count, then ceiling-divides by the page size.
# Reads globals: search_args, recipies_per_search.
get_number_of_search_pages() {
    # Declaration split from assignment so curl/jq failure isn't masked
    # by `local` always succeeding.
    local results
    results=$(curl -s "https://www.bbcgoodfood.com/api/search-frontend/search?limit=1&${search_args}" | jq '.searchResults.totalItems')

    # Ceiling division: (a + b - 1) / b replaces the original
    # if-remainder-is-zero/else branch with a single expression.
    echo $(( (results + recipies_per_search - 1) / recipies_per_search ))
}
||||||
# Echo the recipe's main-ingredient names, one per line, with commas
# stripped (commas would otherwise break the CSV written at the bottom of
# the script — note `s/,//g` now removes ALL commas, not just the first).
# Arguments: $1 - recipe page URL (optional; when empty, parse the global
#                 $cache left by update_cache instead of fetching)
get_main_ingredients() {
    local data
    if [[ -z "$1" ]]; then
        data=$cache
    else
        # URL is quoted so word-splitting/globbing can't mangle it.
        data=$(curl -s "$1")
    fi

    # Ingredient names appear as glossary links on the page...
    echo "$data" | htmlq -p | grep '<a class="link link--styled" data-component="Link" href="/glossary/.*">.*</a>' | sed 's/^.*<a class="link link--styled" data-component="Link" href=".*">//' | sed 's/<\/a>.*//' | sed 's/,//g'
    # ...and also between pairs of React placeholder comments.
    echo "$data" | htmlq -p | grep '<!-- -->.*<!-- -->' | sed 's/<!-- -->/@/g' | awk -F@ '{print $2}' | sed 's/,//g'
}
||||||
# Echo the recipe page's embedded schema.org JSON-LD blob, pretty-printed.
# Unused for now — but could be helpful for expanding this out later.
# Arguments: $1 - recipe page URL (optional; when empty, parse the global
#                 $cache left by update_cache instead of fetching)
get_page_json() {
    local data
    if [[ -z "$1" ]]; then
        data=$cache
    else
        # Quoted URL; declaration split from assignment so curl's exit
        # status isn't masked by `local`.
        data=$(curl -s "$1")
    fi

    # The JSON payload sits on the line after the page-schema <script> tag.
    echo "$data" | htmlq -p | grep '<script data-testid="page-schema" type="application/ld+json">' -A1 | tail -n1 | sed 's/<\/script>//' | jq
}
||||||
# Echo one compact JSON object per search result on the given page:
# {title, url, rating "value (count)", time, skill}.
# Arguments: $1 - 1-based search page number
# Reads globals: search_args, recipies_per_search.
get_simple_search_json() {
    local page=$1
    local data

    # Declaration split from assignment so curl's exit status isn't masked.
    data=$(curl -s "https://www.bbcgoodfood.com/api/search-frontend/search?limit=${recipies_per_search}&page=${page}&${search_args}")

    echo "$data" | jq -c '.searchResults.items[] | {
        title: .title,
        url: "bbcgoodfood.com\(.url)",
        rating: "\(.rating.ratingValue) (\(.rating.ratingCount))",
        time: .terms[0].display,
        skill: .terms[1].display }'
}
|
# Echo the full recipe URL of every result on the given search page,
# one URL per line.
# Arguments: $1 - 1-based search page number
# Reads globals: search_args, recipies_per_search.
get_search_urls() {
    local page=$1
    local data

    # Declaration split from assignment so curl's exit status isn't masked.
    data=$(curl -s "https://www.bbcgoodfood.com/api/search-frontend/search?limit=${recipies_per_search}&page=${page}&${search_args}")

    echo "$data" | jq -r '.searchResults.items[] | "https://www.bbcgoodfood.com\(.url)"'
}
|
|
||||||
|
|
||||||
|
# Accumulate every search result (with its ingredients) into one JSON array.
json_results="[]"

# For each of the search pages...
for page in $(seq $(get_number_of_search_pages)); do
# for page in {1..2}; do # For testing only do a few pages

    # Map result index -> " - "-joined main-ingredient list for that recipe.
    # Re-initialised every page so entries from a longer previous page
    # cannot leak into a shorter one.
    declare -A main_ingredients=()

    # Fetch the search urls ONCE and reuse them. (Previously $urls was
    # assigned but unused and get_search_urls was called a second time in
    # the for-loop — doubling the network traffic and risking the two
    # responses disagreeing, which would desync ingredients from results.)
    urls=$(get_search_urls $page)
    declare -i count=0
    for url in $urls; do
        ingredients=$(get_main_ingredients $url)
        # Squash whitespace and join the ingredient lines with " - ".
        main_ingredients[$count]=$(echo "$ingredients" | awk '$1=$1' ORS=' - ' | sed 's/ - $//')
        count+=1
    done

    # Now process each simple_json from the search page, adding in the
    # ingredients, and then appending it to the final json array.
    count=0
    IFS=$'\n'
    for result in $(get_simple_search_json $page); do
        ingredients=${main_ingredients[$count]}
        # Pass the ingredients via --arg so jq escapes them itself: a quote
        # or backslash in an ingredient name can no longer break the filter
        # (the old string-interpolated filter was injectable).
        json=$(echo "$result" | jq --arg ing "$ingredients" '. + {ingredients: $ing}')
        json_results=$(echo "$json_results" | jq ". + [$json]")
        count+=1
    done
    unset IFS

done

# Print that final json array
echo "$json_results" | jq

# Save json file
echo "$json_results" | jq > goodFoodData.json

# Save CSV: write the header, then APPEND the rows — the row redirection
# previously used '>' again, which clobbered the header line.
echo "Title,Url,Rating,Time,Skill,Ingredients" > goodFoodData.csv
echo "$json_results" | jq -r '.[] | [
    .title,
    .url,
    .rating,
    .time,
    .skill,
    .ingredients
] | @csv' >> goodFoodData.csv
|
Loading…
x
Reference in New Issue
Block a user