Finish scraper and scrape for vegetarian food

CactiChameleon9 2024-12-23 21:52:07 +00:00
parent 2cd5440660
commit dd21ae23ed
3 changed files with 55336 additions and 9 deletions

goodFoodData.csv (Normal file, 6138 lines): file diff suppressed because it is too large

goodFoodData.json (Normal file, 49106 lines): file diff suppressed because it is too large

scrape-all.sh (Normal file → Executable file, 101 lines changed)

@@ -1,30 +1,113 @@
-#!/bin/bash
+#! /usr/bin/env nix-shell
+#! nix-shell -i bash -p htmlq
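# Usage (a sketch; the exact query parameters the BBC Good Food search API
# accepts are an assumption, not something this script documents):
#   ./scrape-all.sh "search=vegetarian"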
search_args=$1
recipes_per_search=500
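# Fetch a page once and keep its HTML in the global $cache, so helpers called
# with no argument can re-parse it without re-downloading.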
update_cache() {
    cache=$(curl "$1" -s)
}
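# Ask the search API for the total hit count, then ceiling-divide by
# recipes_per_search to get the number of result pages to walk.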
get_number_of_search_pages() {
    local results=$(curl "https://www.bbcgoodfood.com/api/search-frontend/search?limit=1&${search_args}" -s | jq '.searchResults.totalItems')
    if [[ $((results % recipes_per_search)) == 0 ]]; then
        echo $((results / recipes_per_search))
    else
        echo $((results / recipes_per_search + 1))
    fi
}
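# Scrape a recipe page's main ingredients: names appear both as /glossary/
# links and between <!-- --> comment markers, so both patterns are extracted;
# the final sed drops commas (presumably to keep the CSV output intact).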
get_main_ingredients() {
    if [[ -z "$1" ]]; then
        local data=$cache
    else
-        local data=$1
+        local data=$(curl "$1" -s)
    fi
-    echo "$data" | htmlq -p | grep '<a class="link link--styled" data-component="Link" href="/glossary/.*">.*</a>' | sed 's/^.*<a class="link link--styled" data-component="Link" href=".*">//' | sed 's/<\/a>.*//'
-    echo "$data" | htmlq -p | grep '<!-- -->.*<!-- -->' | sed 's/<!-- -->/@/g' | awk -F@ '{print $2}'
+    echo "$data" | htmlq -p | grep '<a class="link link--styled" data-component="Link" href="/glossary/.*">.*</a>' | sed 's/^.*<a class="link link--styled" data-component="Link" href=".*">//' | sed 's/<\/a>.*//' | sed 's/,//'
+    echo "$data" | htmlq -p | grep '<!-- -->.*<!-- -->' | sed 's/<!-- -->/@/g' | awk -F@ '{print $2}' | sed 's/,//'
}
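# Extract the JSON-LD "page-schema" blob embedded in a recipe page.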
-get_json() {
+get_page_json() { # Unused, but could be helpful for expanding this out later
    if [[ -z "$1" ]]; then
        local data=$cache
    else
-        local data=$1
+        local data=$(curl "$1" -s)
    fi
    echo "$data" | htmlq -p | grep '<script data-testid="page-schema" type="application/ld+json">' -A1 | tail -n1 | sed 's/<\/script>//' | jq
}
update_cache "https://www.bbcgoodfood.com/recipes/vegan-banana-bread"
get_json
get_main_ingredients
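# Summarise one page of search results as compact JSON objects
# (title, url, rating, time, skill), one object per line.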
get_simple_search_json() {
    local page=$1
    local data=$(curl "https://www.bbcgoodfood.com/api/search-frontend/search?limit=${recipes_per_search}&page=${page}&${search_args}" -s)
    echo "$data" | jq '.searchResults.items[] | {
        title: .title,
        url: "bbcgoodfood.com\(.url)",
        rating: "\(.rating.ratingValue) (\(.rating.ratingCount))",
        time: .terms[0].display,
        skill: .terms[1].display }' -c
}
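# List the full recipe URLs for one page of search results.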
get_search_urls() {
    local page=$1
    local data=$(curl "https://www.bbcgoodfood.com/api/search-frontend/search?limit=${recipes_per_search}&page=${page}&${search_args}" -s)
    echo "$data" | jq -r '.searchResults.items[] | "https://www.bbcgoodfood.com\(.url)"'
}
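# Accumulate every recipe from every search page into one JSON array.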
json_results="[]"
# For each of the search pages...
for page in $(seq $(get_number_of_search_pages)); do
    # for page in {1..2}; do # For testing, only do a few pages
    # Make an array to store the main ingredients (reset for each page)
    declare -a main_ingredients=()
    # Get the search urls and grab their main ingredients
    urls=$(get_search_urls $page)
    declare -i count=0
    for url in $urls; do
        ingredients=$(get_main_ingredients "$url")
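        # Join the one-per-line ingredients into a single " - "-separated
        # string (awk re-prints each non-empty line with ORS as the separator;
        # sed trims the trailing separator).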
        main_ingredients[$count]=$(echo "$ingredients" | awk '$1=$1' ORS=' - ' | sed 's/ - $//')
        count+=1
    done
    # Now process each simple_json from the search page, adding in the ingredients
    # and then appending it to the final json array
    count=0
    IFS=$'\n'
    for result in $(get_simple_search_json $page); do
        ingredients=${main_ingredients[$count]}
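        # Merge the ingredients string into the recipe object, then append it
        # to the array (--arg/--argjson keep quotes in the scraped data from
        # breaking the jq programs).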
        json=$(echo "$result" | jq --arg ing "$ingredients" '. + {ingredients: $ing}')
        json_results=$(echo "$json_results" | jq --argjson item "$json" '. + [$item]')
        count+=1
    done
    unset IFS
done
# Print that final json array
echo "$json_results" | jq

# Save json file
echo "$json_results" | jq > goodFoodData.json

# Save CSV (header first, then append the rows so the header is not overwritten)
echo "Title,Url,Rating,Time,Skill,Ingredients" > goodFoodData.csv
echo "$json_results" | jq -r '.[] | [
    .title,
    .url,
    .rating,
    .time,
    .skill,
    .ingredients
] | @csv' >> goodFoodData.csv
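
# A quick sanity check on the generated files (illustrative commands only,
# not part of the commit):
#   head -n 3 goodFoodData.csv
#   jq length goodFoodData.json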