diff --git a/scrape-all.sh b/scrape-all.sh index 1fe7f82..324303b 100755 --- a/scrape-all.sh +++ b/scrape-all.sh @@ -2,7 +2,12 @@ #! nix-shell -i bash -p htmlq search_args=$1 -recipies_per_search=500 +throttling=$2 +recipies_per_search=50 + +if [[ -z "$2" ]]; then + throttling=0 +fi update_cache() { @@ -62,12 +67,20 @@ get_search_urls() { } +echo "Welcome! Beginning scrape" json_results="[]" +total_search_pages=$(get_number_of_search_pages) + +echo +echo "Scraping $total_search_pages search pages" +echo "Each has a max of $recipies_per_search recipes" + # For each of the search pages... -for page in $(seq $(get_number_of_search_pages)); do +for page in $(seq $total_search_pages); do # for page in {1..2}; do # For testing only do a few pages + echo "Starting search page $page..." # Make an array to store the main ingredients declare -A main_ingredients @@ -76,9 +89,11 @@ for page in $(seq $(get_number_of_search_pages)); do urls=$(get_search_urls $page) declare -i count=0 for url in $(get_search_urls $page); do + echo "Recipe $count done" ingredients=$(get_main_ingredients $url) main_ingredients[$count]=$(echo "$ingredients" | awk '$1=$1' ORS=' - ' | sed 's/ - $//') count+=1 + sleep $throttling done # Now process each simple_json from the search page adding in the ingredients