Finish scraper and scrape for vegetarian food
This commit is contained in:
parent
2cd5440660
commit
dd21ae23ed
6138
goodFoodData.csv
Normal file
6138
goodFoodData.csv
Normal file
File diff suppressed because it is too large
Load Diff
49106
goodFoodData.json
Normal file
49106
goodFoodData.json
Normal file
File diff suppressed because it is too large
Load Diff
101
scrape-all.sh
Normal file → Executable file
101
scrape-all.sh
Normal file → Executable file
#! /usr/bin/env nix-shell
#! nix-shell -i bash -p htmlq jq curl
# Scrape bbcgoodfood.com search results (title, url, rating, time, skill,
# main ingredients) into goodFoodData.json and goodFoodData.csv.
#
# Usage: ./scrape-all.sh '<search query-string fragment>'
#   e.g. ./scrape-all.sh 'diet=vegetarian'
#
# NOTE(review): the nix-shell -p list previously only provided htmlq, but the
# script also shells out to jq and curl on every request — added both so the
# environment is self-contained.

# Raw query-string fragment appended to every search API call.
search_args=$1

# Page size requested from the search API; also the divisor for the
# page-count ceiling division below.
recipies_per_search=500  # NOTE(review): "recipies" is a typo for "recipes"; kept as-is because every function reads this exact name
||||||
# Download the page at URL $1 and stash the response body in the global
# $cache, so later helpers can re-parse the same page without re-downloading.
update_cache() {
  cache="$(curl -s "$1")"
}
|
# Echo how many search pages are needed to cover every result.
# Queries the search API once with limit=1 just to read the total item
# count, then ceiling-divides by the page size.
# Reads globals: search_args, recipies_per_search.
get_number_of_search_pages() {
    # Declaration split from assignment so curl/jq failure isn't masked
    # by `local` always succeeding.
    local results
    results=$(curl -s "https://www.bbcgoodfood.com/api/search-frontend/search?limit=1&${search_args}" | jq '.searchResults.totalItems')

    # Ceiling division: (a + b - 1) / b replaces the original
    # if-remainder-is-zero/else branch with a single expression.
    echo $(( (results + recipies_per_search - 1) / recipies_per_search ))
}
||||||
# Echo the recipe's main-ingredient names, one per line, with commas
# stripped (commas would otherwise break the CSV written at the bottom of
# the script — note `s/,//g` now removes ALL commas, not just the first).
# Arguments: $1 - recipe page URL (optional; when empty, parse the global
#                 $cache left by update_cache instead of fetching)
get_main_ingredients() {
    local data
    if [[ -z "$1" ]]; then
        data=$cache
    else
        # URL is quoted so word-splitting/globbing can't mangle it.
        data=$(curl -s "$1")
    fi

    # Ingredient names appear as glossary links on the page...
    echo "$data" | htmlq -p | grep '<a class="link link--styled" data-component="Link" href="/glossary/.*">.*</a>' | sed 's/^.*<a class="link link--styled" data-component="Link" href=".*">//' | sed 's/<\/a>.*//' | sed 's/,//g'
    # ...and also between pairs of React placeholder comments.
    echo "$data" | htmlq -p | grep '<!-- -->.*<!-- -->' | sed 's/<!-- -->/@/g' | awk -F@ '{print $2}' | sed 's/,//g'
}
||||||
# Echo the recipe page's embedded schema.org JSON-LD blob, pretty-printed.
# Unused for now — but could be helpful for expanding this out later.
# Arguments: $1 - recipe page URL (optional; when empty, parse the global
#                 $cache left by update_cache instead of fetching)
get_page_json() {
    local data
    if [[ -z "$1" ]]; then
        data=$cache
    else
        # Quoted URL; declaration split from assignment so curl's exit
        # status isn't masked by `local`.
        data=$(curl -s "$1")
    fi

    # The JSON payload sits on the line after the page-schema <script> tag.
    echo "$data" | htmlq -p | grep '<script data-testid="page-schema" type="application/ld+json">' -A1 | tail -n1 | sed 's/<\/script>//' | jq
}
||||||
# Echo one compact JSON object per search result on the given page:
# {title, url, rating "value (count)", time, skill}.
# Arguments: $1 - 1-based search page number
# Reads globals: search_args, recipies_per_search.
get_simple_search_json() {
    local page=$1
    local data

    # Declaration split from assignment so curl's exit status isn't masked.
    data=$(curl -s "https://www.bbcgoodfood.com/api/search-frontend/search?limit=${recipies_per_search}&page=${page}&${search_args}")

    echo "$data" | jq -c '.searchResults.items[] | {
        title: .title,
        url: "bbcgoodfood.com\(.url)",
        rating: "\(.rating.ratingValue) (\(.rating.ratingCount))",
        time: .terms[0].display,
        skill: .terms[1].display }'
}
|
# Echo the full recipe URL of every result on the given search page,
# one URL per line.
# Arguments: $1 - 1-based search page number
# Reads globals: search_args, recipies_per_search.
get_search_urls() {
    local page=$1
    local data

    # Declaration split from assignment so curl's exit status isn't masked.
    data=$(curl -s "https://www.bbcgoodfood.com/api/search-frontend/search?limit=${recipies_per_search}&page=${page}&${search_args}")

    echo "$data" | jq -r '.searchResults.items[] | "https://www.bbcgoodfood.com\(.url)"'
}
|
|
||||||
|
|
||||||
|
# Accumulate every search result (with its ingredients) into one JSON array.
json_results="[]"

# For each of the search pages...
for page in $(seq $(get_number_of_search_pages)); do
# for page in {1..2}; do # For testing only do a few pages

    # Map result index -> " - "-joined main-ingredient list for that recipe.
    # Re-initialised every page so entries from a longer previous page
    # cannot leak into a shorter one.
    declare -A main_ingredients=()

    # Fetch the search urls ONCE and reuse them. (Previously $urls was
    # assigned but unused and get_search_urls was called a second time in
    # the for-loop — doubling the network traffic and risking the two
    # responses disagreeing, which would desync ingredients from results.)
    urls=$(get_search_urls $page)
    declare -i count=0
    for url in $urls; do
        ingredients=$(get_main_ingredients $url)
        # Squash whitespace and join the ingredient lines with " - ".
        main_ingredients[$count]=$(echo "$ingredients" | awk '$1=$1' ORS=' - ' | sed 's/ - $//')
        count+=1
    done

    # Now process each simple_json from the search page, adding in the
    # ingredients, and then appending it to the final json array.
    count=0
    IFS=$'\n'
    for result in $(get_simple_search_json $page); do
        ingredients=${main_ingredients[$count]}
        # Pass the ingredients via --arg so jq escapes them itself: a quote
        # or backslash in an ingredient name can no longer break the filter
        # (the old string-interpolated filter was injectable).
        json=$(echo "$result" | jq --arg ing "$ingredients" '. + {ingredients: $ing}')
        json_results=$(echo "$json_results" | jq ". + [$json]")
        count+=1
    done
    unset IFS

done

# Print that final json array
echo "$json_results" | jq

# Save json file
echo "$json_results" | jq > goodFoodData.json

# Save CSV: write the header, then APPEND the rows — the row redirection
# previously used '>' again, which clobbered the header line.
echo "Title,Url,Rating,Time,Skill,Ingredients" > goodFoodData.csv
echo "$json_results" | jq -r '.[] | [
    .title,
    .url,
    .rating,
    .time,
    .skill,
    .ingredients
] | @csv' >> goodFoodData.csv
|
Loading…
x
Reference in New Issue
Block a user