BBCGoodFood-Scrape/scrape-all.sh

#! /usr/bin/env nix-shell
#! nix-shell -i bash -p htmlq jq curl
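
# Scrape BBC Good Food search results into JSON (goodFoodData.json) and CSV
# (goodFoodData.csv).
#
# Usage: ./scrape-all.sh <search_args> [throttling]
#   search_args - query-string fragment appended to the search API URL
#                 (e.g. "search=chicken" -- assumed; pass whatever parameters
#                 the search-frontend endpoint accepts)
#   throttling  - optional seconds to sleep between recipe requests (default 0)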
search_args=$1
throttling=$2
recipies_per_search=50
if [[ -z "$2" ]]; then
    throttling=0
fi
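
# Fetch a URL and keep the response in the global $cache; get_main_ingredients
# and get_page_json fall back to $cache when called with no URL. Currently
# unused by the main flow.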
update_cache() {
    cache=$(curl "$1" -s)
}
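
# Query the search API with limit=1 just to read totalItems, then work out how
# many pages of $recipies_per_search results are needed (rounding up).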
get_number_of_search_pages() {
    local results
    results=$(curl "https://www.bbcgoodfood.com/api/search-frontend/search?limit=1&${search_args}" -s | jq '.searchResults.totalItems')
    if [[ $((results % recipies_per_search)) == 0 ]]; then
        echo $((results / recipies_per_search))
    else
        echo $((results / recipies_per_search + 1))
    fi
}
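
# Print a recipe page's main ingredients, one per line. Scrapes the page passed
# as $1, or the page last stored in $cache if no URL is given.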
get_main_ingredients() {
    local data
    if [[ -z "$1" ]]; then
        data=$cache
    else
        data=$(curl "$1" -s)
    fi
    # Ingredients rendered as glossary links: keep just the link text
    echo "$data" | htmlq -p | grep '<a class="link link--styled" data-component="Link" href="/glossary/.*">.*</a>' | sed 's/^.*<a class="link link--styled" data-component="Link" href=".*">//' | sed 's/<\/a>.*//' | sed 's/,//'
    # Ingredients without a glossary link sit between <!-- --> markers: keep the
    # text between the first pair of markers
    echo "$data" | htmlq -p | grep '<!-- -->.*<!-- -->' | sed 's/<!-- -->/@/g' | awk -F@ '{print $2}' | sed 's/,//'
}
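
# Pull the page's schema.org JSON-LD (<script data-testid="page-schema">) and
# pretty-print it with jq.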
get_page_json() { # Unused, but could be helpful for expanding this out later
    local data
    if [[ -z "$1" ]]; then
        data=$cache
    else
        data=$(curl "$1" -s)
    fi
    echo "$data" | htmlq -p | grep '<script data-testid="page-schema" type="application/ld+json">' -A1 | tail -n1 | sed 's/<\/script>//' | jq .
}
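
# Fetch one page of search results and emit a compact JSON object per recipe:
# title, URL, rating, time and skill level.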
get_simple_search_json() {
    local page=$1
    local data
    data=$(curl "https://www.bbcgoodfood.com/api/search-frontend/search?limit=${recipies_per_search}&page=${page}&${search_args}" -s)
    echo "$data" | jq '.searchResults.items[] | {
        title: .title,
        url: "bbcgoodfood.com\(.url)",
        rating: "\(.rating.ratingValue) (\(.rating.ratingCount))",
        time: .terms[0].display,
        skill: .terms[1].display }' -c
}
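
# Fetch one page of search results and print the full URL of every recipe on it.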
get_search_urls() {
    local page=$1
    local data
    data=$(curl "https://www.bbcgoodfood.com/api/search-frontend/search?limit=${recipies_per_search}&page=${page}&${search_args}" -s)
    echo "$data" | jq -r '.searchResults.items[] | "https://www.bbcgoodfood.com\(.url)"'
}
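
# Main scrape: walk every search page, grab each recipe's main ingredients,
# merge them into the search metadata, and collect everything in one JSON array.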
echo "Welcome! Beginning scrape"
json_results="[]"
total_search_pages=$(get_number_of_search_pages)
echo
echo "Scraping $total_search_pages search pages"
echo "Each has a max of $recipies_per_search recipes"
# For each of the search pages...
for page in $(seq "$total_search_pages"); do
# for page in {1..2}; do # For testing, only do a few pages
    echo "Starting search page $page..."
    # Make an array to store the main ingredients
    declare -A main_ingredients
    # Get the search urls and grab their main ingredients
    urls=$(get_search_urls "$page")
    declare -i count=0
    for url in $urls; do
        ingredients=$(get_main_ingredients "$url")
        # Join the ingredient lines into a single " - " separated string
        main_ingredients[$count]=$(echo "$ingredients" | awk '$1=$1' ORS=' - ' | sed 's/ - $//')
        echo "Recipe $count done"
        count+=1
        sleep "$throttling"
    done
    # Now process each simple_json from the search page, adding in the
    # ingredients and then appending it to the final json array
    count=0
    IFS=$'\n'
    for result in $(get_simple_search_json "$page"); do
        ingredients=${main_ingredients[$count]}
        json=$(echo "$result" | jq --arg ingredients "$ingredients" '. + {ingredients: $ingredients}')
        json_results=$(echo "$json_results" | jq --argjson recipe "$json" '. + [$recipe]')
        count+=1
    done
    unset IFS
done
# Print that final json array
echo "$json_results" | jq .
# Save json file
echo "$json_results" | jq . > goodFoodData.json
# Save CSV (header first, then append the rows)
echo "Title,Url,Rating,Time,Skill,Ingredients" > goodFoodData.csv
echo "$json_results" | jq -r '.[] | [
    .title,
    .url,
    .rating,
    .time,
    .skill,
    .ingredients
] | @csv' >> goodFoodData.csv