BBCGoodFood-Scrape/scrape-all.sh


#! /usr/bin/env nix-shell
#! nix-shell -i bash -p htmlq jq curl
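
# Usage: ./scrape-all.sh <search_args> [throttling]
#   search_args - query-string parameters appended to the search API call,
#                 e.g. "search=chicken" (the exact parameter names depend on the API)
#   throttling  - optional delay in seconds between recipe requests (defaults to 0)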
search_args=$1
throttling=$2
recipies_per_search=50
if [[ -z "$2" ]]; then
    throttling=0
fi
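
# Download a page into $cache; the helpers below fall back to $cache when called without a URL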
update_cache() {
    cache=$(curl "$1" -s)
}
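
# Ask the search API for the total result count and work out how many pages that is (rounding up)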
get_number_of_search_pages() {
    local results=$(curl "https://www.bbcgoodfood.com/api/search-frontend/search?limit=1&${search_args}" -s | jq '.searchResults.totalItems')
    if [[ $(($results % recipies_per_search)) == 0 ]]; then
        echo $(($results / $recipies_per_search))
    else
        echo $(($results / $recipies_per_search + 1))
    fi
}
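
# Scrape the main ingredient names from a recipe page, or from $cache if no URL is given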
get_main_ingredients() {
    if [[ -z "$1" ]]; then
        local data=$cache
    else
        local data=$(curl "$1" -s)
    fi
    # Ingredient names that link to the glossary sit inside anchor tags
    echo "$data" | htmlq -p | grep '<a class="link link--styled" data-component="Link" href="/glossary/.*">.*</a>' | sed 's/^.*<a class="link link--styled" data-component="Link" href=".*">//' | sed 's/<\/a>.*//' | sed 's/,//'
    # Other ingredient names sit between empty HTML comment markers
    echo "$data" | htmlq -p | grep '<!-- -->.*<!-- -->' | sed 's/<!-- -->/@/g' | awk -F@ '{print $2}' | sed 's/,//'
}
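
# Extract the ld+json page schema from a recipe page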
get_page_json() { # Unused - but could be helpful for expanding this out later
    if [[ -z "$1" ]]; then
        local data=$cache
    else
        local data=$(curl "$1" -s)
    fi
    echo "$data" | htmlq -p | grep '<script data-testid="page-schema" type="application/ld+json">' -A1 | tail -n1 | sed 's/<\/script>//' | jq
}
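
# Summarise each recipe on a search results page as a compact JSON object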
get_simple_search_json() {
    local page=$1
    local data=$(curl "https://www.bbcgoodfood.com/api/search-frontend/search?limit=${recipies_per_search}&page=${page}&${search_args}" -s)
    echo "$data" | jq '.searchResults.items[] | {
        title: .title,
        url: "bbcgoodfood.com\(.url)",
        rating: "\(.rating.ratingValue) (\(.rating.ratingCount))",
        time: .terms[0].display,
        skill: .terms[1].display }' -c
}
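
# List the full recipe URLs from a search results page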
get_search_urls() {
    local page=$1
    local data=$(curl "https://www.bbcgoodfood.com/api/search-frontend/search?limit=${recipies_per_search}&page=${page}&${search_args}" -s)
    echo "$data" | jq -r '.searchResults.items[] | "https://www.bbcgoodfood.com\(.url)"'
}
echo "Welcome! Beginning scrape"
json_results="[]"
total_search_pages=$(get_number_of_search_pages)
echo
echo "Scraping $total_search_pages search pages"
echo "Each has a max of $recipies_per_search recipes"
# For each of the search pages...
for page in $(seq $total_search_pages); do
    # for page in {1..2}; do # For testing, only do a few pages
echo "Starting search page $page..."
# Make an array to store the main ingredients
declare -A main_ingredients
# Get the search urls and grab their main ingredients
urls=$(get_search_urls $page)
declare -i count=0
for url in $(get_search_urls $page); do
2024-12-23 22:37:34 +00:00
echo "Recipe $count done"
ingredients=$(get_main_ingredients $url)
main_ingredients[$count]=$(echo "$ingredients" | awk '$1=$1' ORS=' - ' | sed 's/ - $//')
count+=1
2024-12-23 22:37:34 +00:00
sleep $throttling
done
    # Now process each simple_json from the search page, adding in the ingredients
    # and then appending it to the final json array
    count=0
    IFS=$'\n'
    for result in $(get_simple_search_json $page); do
        ingredients=${main_ingredients[$count]}
        json=$(echo "$result" | jq --arg ingredients "$ingredients" '. + {ingredients: $ingredients}')
        json_results=$(echo "$json_results" | jq --argjson recipe "$json" '. + [$recipe]')
        count+=1
    done
    unset IFS
done
# Print that final json array
echo "$json_results" | jq
# Save json file
echo "$json_results" | jq > goodFoodData.json
# Save CSV
echo "Title,Url,Rating,Time,Skill,Ingredients" > goodFoodData.csv
echo "$json_results" | jq -r '.[] | [
    .title,
    .url,
    .rating,
    .time,
    .skill,
    .ingredients
] | @csv' >> goodFoodData.csv