Finish scraper and scrape for vegetarian food

CactiChameleon9 2024-12-23 21:52:07 +00:00
parent 2cd5440660
commit dd21ae23ed
3 changed files with 55336 additions and 9 deletions

goodFoodData.csv (Normal file, 6138 lines): file diff suppressed because it is too large

goodFoodData.json (Normal file, 49106 lines): file diff suppressed because it is too large

scrape-all.sh (Normal file → Executable file, 101 lines changed)

@@ -1,30 +1,113 @@
-#!/bin/bash
+#! /usr/bin/env nix-shell
+#! nix-shell -i bash -p htmlq
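# Usage (a sketch; the exact query parameters the BBC Good Food search API
# accepts are an assumption, not something this script documents):
#   ./scrape-all.sh "search=vegetarian"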
search_args=$1
recipes_per_search=500
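# Fetch a page once and keep its HTML in the global $cache, so helpers called
# with no argument can re-parse it without re-downloading.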
update_cache() {
    cache=$(curl "$1" -s)
}
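# Ask the search API for the total hit count, then ceiling-divide by
# recipes_per_search to get the number of result pages to walk.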
get_number_of_search_pages() {
    local results=$(curl "https://www.bbcgoodfood.com/api/search-frontend/search?limit=1&${search_args}" -s | jq '.searchResults.totalItems')
    if [[ $((results % recipes_per_search)) == 0 ]]; then
        echo $((results / recipes_per_search))
    else
        echo $((results / recipes_per_search + 1))
    fi
}
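# Scrape a recipe page's main ingredients: names appear both as /glossary/
# links and between <!-- --> comment markers, so both patterns are extracted;
# the final sed drops commas (presumably to keep the CSV output intact).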
get_main_ingredients() {
    if [[ -z "$1" ]]; then
        local data=$cache
    else
-        local data=$1
+        local data=$(curl "$1" -s)
    fi
-    echo "$data" | htmlq -p | grep '<a class="link link--styled" data-component="Link" href="/glossary/.*">.*</a>' | sed 's/^.*<a class="link link--styled" data-component="Link" href=".*">//' | sed 's/<\/a>.*//'
-    echo "$data" | htmlq -p | grep '<!-- -->.*<!-- -->' | sed 's/<!-- -->/@/g' | awk -F@ '{print $2}'
+    echo "$data" | htmlq -p | grep '<a class="link link--styled" data-component="Link" href="/glossary/.*">.*</a>' | sed 's/^.*<a class="link link--styled" data-component="Link" href=".*">//' | sed 's/<\/a>.*//' | sed 's/,//'
+    echo "$data" | htmlq -p | grep '<!-- -->.*<!-- -->' | sed 's/<!-- -->/@/g' | awk -F@ '{print $2}' | sed 's/,//'
}
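# Extract the JSON-LD "page-schema" blob embedded in a recipe page.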
-get_json() {
+get_page_json() { # Unused, but could be helpful for expanding this out later
    if [[ -z "$1" ]]; then
        local data=$cache
    else
-        local data=$1
+        local data=$(curl "$1" -s)
    fi
    echo "$data" | htmlq -p | grep '<script data-testid="page-schema" type="application/ld+json">' -A1 | tail -n1 | sed 's/<\/script>//' | jq
}
update_cache "https://www.bbcgoodfood.com/recipes/vegan-banana-bread"
get_json
get_main_ingredients
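# Summarise one page of search results as compact JSON objects
# (title, url, rating, time, skill), one object per line.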
get_simple_search_json() {
    local page=$1
    local data=$(curl "https://www.bbcgoodfood.com/api/search-frontend/search?limit=${recipes_per_search}&page=${page}&${search_args}" -s)
    echo "$data" | jq '.searchResults.items[] | {
        title: .title,
        url: "bbcgoodfood.com\(.url)",
        rating: "\(.rating.ratingValue) (\(.rating.ratingCount))",
        time: .terms[0].display,
        skill: .terms[1].display }' -c
}
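# List the full recipe URLs for one page of search results.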
get_search_urls() {
    local page=$1
    local data=$(curl "https://www.bbcgoodfood.com/api/search-frontend/search?limit=${recipes_per_search}&page=${page}&${search_args}" -s)
    echo "$data" | jq -r '.searchResults.items[] | "https://www.bbcgoodfood.com\(.url)"'
}
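# Accumulate every recipe from every search page into one JSON array.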
json_results="[]"
# For each of the search pages...
for page in $(seq $(get_number_of_search_pages)); do
    # for page in {1..2}; do # For testing, only do a few pages
    # Make an array to store the main ingredients (reset for each page)
    declare -a main_ingredients=()
    # Get the search urls and grab their main ingredients
    urls=$(get_search_urls $page)
    declare -i count=0
    for url in $urls; do
        ingredients=$(get_main_ingredients "$url")
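        # Join the one-per-line ingredients into a single " - "-separated
        # string (awk re-prints each non-empty line with ORS as the separator;
        # sed trims the trailing separator).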
        main_ingredients[$count]=$(echo "$ingredients" | awk '$1=$1' ORS=' - ' | sed 's/ - $//')
        count+=1
    done
    # Now process each simple_json from the search page, adding in the ingredients
    # and then appending it to the final json array
    count=0
    IFS=$'\n'
    for result in $(get_simple_search_json $page); do
        ingredients=${main_ingredients[$count]}
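        # Merge the ingredients string into the recipe object, then append it
        # to the array (--arg/--argjson keep quotes in the scraped data from
        # breaking the jq programs).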
        json=$(echo "$result" | jq --arg ing "$ingredients" '. + {ingredients: $ing}')
        json_results=$(echo "$json_results" | jq --argjson item "$json" '. + [$item]')
        count+=1
    done
    unset IFS
done
# Print that final json array
echo "$json_results" | jq

# Save json file
echo "$json_results" | jq > goodFoodData.json

# Save CSV (header first, then append the rows so the header is not overwritten)
echo "Title,Url,Rating,Time,Skill,Ingredients" > goodFoodData.csv
echo "$json_results" | jq -r '.[] | [
    .title,
    .url,
    .rating,
    .time,
    .skill,
    .ingredients
] | @csv' >> goodFoodData.csv
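
# A quick sanity check on the generated files (illustrative commands only,
# not part of the commit):
#   head -n 3 goodFoodData.csv
#   jq length goodFoodData.json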