| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 |
#!/usr/bin/env bash
#
# Runs the schema migrations, then collects the scraper scripts to execute.
#
# Optional environment variables read here:
#   ONLY     comma-separated scraper names to run exclusively
#            (e.g. "meandervalley" or "kentish,break_oday")
#   SKIP     comma-separated scraper names to leave out
#            (e.g. "hobartcity,latrobe")
#   DEBUG    handed through to each scraper unchanged
#   DRY_RUN  handed through to each scraper unchanged

# Strict mode: abort on errors, on unset variables and on failures anywhere
# in a pipeline.
set -o errexit -o nounset -o pipefail

printf 'Starting run at %s\n' "$(date -u +'%Y-%m-%d %H:%M:%S')"

printf '%s\n' "Running schema migrations…"
ruby /app/lib/migrate.rb

# Snapshot the optional settings; each defaults to the empty string so that
# later reads are safe under nounset.
ONLY_LIST=${ONLY-}
SKIP_LIST=${SKIP-}
DEBUG_FLAG=${DEBUG-}
DRY_FLAG=${DRY_RUN-}
# ${VAR-} and ${VAR:-} both yield "" for unset *and* for empty values, so the
# resulting strings are the same as with the ":-" form.

# With nullglob an unmatched pattern expands to nothing, so SCRIPTS is an
# empty array (not the literal pattern) when no scraper files exist.
shopt -s nullglob
SCRIPTS=(/app/scrapers/*.rb)
#######################################
# Tests whether a name appears in a comma-separated list.
# Spaces inside each list entry are removed before the (exact, literal)
# comparison, so "a, b" and "a,b" are equivalent.
# Arguments: $1 - comma-separated list
#            $2 - name to look for
# Returns:   0 if the name is listed, 1 otherwise
#######################################
_csv_list_contains() {
  local csv="$1"
  local wanted="$2"
  local -a entries=()
  local entry

  # IFS is changed for this one `read` only; the array stays local so
  # nothing leaks into the caller's scope.
  IFS=',' read -ra entries <<< "$csv"

  # The ${arr[@]+...} form keeps an empty array from tripping `set -u`
  # on bash releases older than 4.4.
  for entry in ${entries[@]+"${entries[@]}"}; do
    entry="${entry// /}"
    if [[ "$entry" == "$wanted" ]]; then
      return 0
    fi
  done
  return 1
}

#######################################
# Decides whether the scraper with the given name should be executed.
# Globals:   ONLY_LIST (read) - when non-empty, only names listed here run
#                               and SKIP_LIST is ignored
#            SKIP_LIST (read) - names listed here are excluded
# Arguments: $1 - scraper name
# Outputs:   "1" on stdout if the scraper should run, "0" otherwise
#######################################
should_run() {
  local name="$1"

  # An allow-list, when present, is the sole authority.
  if [[ -n "${ONLY_LIST:-}" ]]; then
    if _csv_list_contains "$ONLY_LIST" "$name"; then
      echo "1"
    else
      echo "0"
    fi
    return
  fi

  if [[ -n "${SKIP_LIST:-}" ]] && _csv_list_contains "$SKIP_LIST" "$name"; then
    echo "0"
    return
  fi

  echo "1"
}
# ---------------------------------------------------------------------------
# Log-parsing helpers. Each takes the path of a captured scraper log.
# ---------------------------------------------------------------------------

#######################################
# Prints how many records a scraper reported saving.
# The last case-insensitive "saved N" match wins. When there is none, or it
# is zero, the count of lines starting with "Upserted" (optionally preceded
# by one space) is used instead.
# Arguments: $1 - log file path
# Outputs:   a base-10 integer on stdout (0 when nothing was found)
#######################################
extract_saved_count() {
  local log_file="$1"
  local reported upserts

  # `|| true`: a grep that finds nothing is not an error here.
  reported=$(grep -oiE 'saved [0-9]+' "$log_file" | tail -n 1 | grep -oE '[0-9]+' || true)
  if [[ -z "$reported" || "$reported" == "0" ]]; then
    upserts=$(grep -cE '^(Upserted| Upserted)' "$log_file" 2>/dev/null || true)
    if [[ "${upserts:-0}" -gt 0 ]]; then
      reported="$upserts"
    fi
  fi

  # Force base 10: a value such as "08" would otherwise be read as invalid
  # octal by shell arithmetic and abort the whole run.
  printf '%d\n' "$((10#${reported:-0}))"
}

#######################################
# Prints the number of log lines that start with WARN (leading whitespace
# allowed).
# Arguments: $1 - log file path
# Outputs:   integer on stdout
#######################################
count_warn_lines() {
  local log_file="$1"
  local matches

  # [[:space:]] is the portable spelling of GNU grep's \s.
  matches=$(grep -c '^[[:space:]]*WARN' "$log_file" || true)
  printf '%s\n' "${matches:-0}"
}

#######################################
# Prints the status label for one scraper run.
# Arguments: $1 - scraper exit code
#            $2 - number of WARN lines
#            $3 - log file path
# Outputs:   one of ERROR, blocked, warn, ok (checked in that order)
#######################################
classify_run() {
  local exit_code="$1"
  local warn_count="$2"
  local log_file="$3"

  if [[ "$exit_code" -ne 0 ]]; then
    echo "ERROR"
  elif grep -qiE 'cloudflare|blocked by|challenge page' "$log_file" 2>/dev/null; then
    echo "blocked"
  elif [[ "$warn_count" -gt 0 ]]; then
    echo "warn"
  else
    echo "ok"
  fi
}

# ---------------------------------------------------------------------------
# Run every selected scraper and collect per-scraper results
# ---------------------------------------------------------------------------
count=0
total_saved=0
total_warns=0
# Each entry: "name|saved|warns|status"
SUMMARY=()

# Remove the in-flight capture file on any exit path (including Ctrl-C or an
# errexit abort between mktemp and the explicit rm below).
tmpfile=""
cleanup_tmpfile() {
  if [[ -n "${tmpfile:-}" ]]; then
    rm -f -- "$tmpfile"
  fi
}
trap cleanup_tmpfile EXIT

# ${arr[@]+...} keeps an empty SCRIPTS array from tripping `set -u` on bash
# releases older than 4.4.
for f in ${SCRIPTS[@]+"${SCRIPTS[@]}"}; do
  name="$(basename "$f" .rb)"
  table="da_${name}"
  if [[ "$(should_run "$name")" != "1" ]]; then
    continue
  fi

  echo ""
  echo "Running ${name} -> table ${table}"

  # The X's must be the last characters of the template: BusyBox and BSD
  # mktemp do not substitute them when a suffix follows.
  tmpfile=$(mktemp /tmp/scraper_XXXXXX)

  # Run scraper; merge stderr into stdout so tee captures both.
  # Errexit is switched off around the pipeline so that a non-zero ruby exit
  # (surfaced through pipefail) does not abort the loop; the real exit code
  # is read from PIPESTATUS immediately afterwards.
  set +e
  TABLE_NAME="$table" DEBUG="$DEBUG_FLAG" DRY_RUN="$DRY_FLAG" ruby "$f" 2>&1 | tee "$tmpfile"
  ruby_exit=${PIPESTATUS[0]}
  set -e

  if [[ $ruby_exit -ne 0 ]]; then
    echo " [run_all] scraper exited with code ${ruby_exit}"
  fi

  # --- Parse summary fields from captured output ---
  saved=$(extract_saved_count "$tmpfile")
  warns=$(count_warn_lines "$tmpfile")
  status=$(classify_run "$ruby_exit" "$warns" "$tmpfile")

  rm -f -- "$tmpfile"
  tmpfile=""

  SUMMARY+=("${name}|${saved}|${warns}|${status}")
  total_saved=$((total_saved + saved))
  total_warns=$((total_warns + warns))
  count=$((count + 1))
done
finish_time=$(date -u +"%Y-%m-%d %H:%M:%S")

# ---------------------------------------------------------------------------
# Summary table
# ---------------------------------------------------------------------------

#######################################
# Prints the per-scraper result table followed by a totals row.
# Globals:   SUMMARY (read)      - entries of the form "name|saved|warns|status"
#            finish_time (read)  - timestamp shown in the heading
#            count, total_saved, total_warns (read) - totals row values
# Outputs:   the table on stdout; the status column is colour-coded only
#            when stdout is a terminal
#######################################
print_summary() {
  local heavy_rule="========================================================================"
  local light_rule=" ------------------------------------------------------------------------"
  local entry n s w st
  local colour=""
  local reset=""
  local use_colour=0

  # Decide once, not per row, whether escape sequences are appropriate.
  if [[ -t 1 ]]; then
    use_colour=1
    reset=$'\033[0m'
  fi

  echo ""
  echo "$heavy_rule"
  printf " SCRAPE SUMMARY — finished %s UTC\n" "${finish_time:-}"
  echo "$heavy_rule"
  printf " %-32s %6s %5s %s\n" "Council" "Saved" "Warns" "Status"
  echo "$light_rule"

  # ${arr[@]+...} keeps an empty SUMMARY (nothing selected to run) from
  # tripping `set -u` on bash releases older than 4.4.
  for entry in ${SUMMARY[@]+"${SUMMARY[@]}"}; do
    IFS='|' read -r n s w st <<< "$entry"

    colour=""
    if [[ $use_colour -eq 1 ]]; then
      case "$st" in
        ok) colour=$'\033[0;32m' ;;      # green
        warn) colour=$'\033[0;33m' ;;    # yellow
        blocked) colour=$'\033[0;33m' ;; # yellow
        ERROR) colour=$'\033[0;31m' ;;   # red
        *) colour="" ;;
      esac
    fi

    # Escape sequences are passed as arguments, never spliced into the
    # format string; both are empty when stdout is not a terminal.
    printf " %-32s %6s %5s %s%s%s\n" "$n" "$s" "$w" "$colour" "$st" "$reset"
  done

  echo "$light_rule"
  printf " %-32s %6s %5s\n" "TOTAL (${count:-0} scrapers)" "${total_saved:-0}" "${total_warns:-0}"
  echo "$heavy_rule"
}

print_summary
# ---------------------------------------------------------------------------
# Email summary if any scraper errored (requires SMTP_HOST to be configured)
# ---------------------------------------------------------------------------

#######################################
# Reports whether any scraper finished with status ERROR.
# Globals:   SUMMARY (read) - entries of the form "name|saved|warns|status"
# Returns:   0 if at least one entry has status ERROR, 1 otherwise
#######################################
summary_has_errors() {
  local entry

  # ${arr[@]+...} keeps an empty SUMMARY from tripping `set -u` on bash
  # releases older than 4.4.
  for entry in ${SUMMARY[@]+"${SUMMARY[@]}"}; do
    # The status is the last |-separated field.
    if [[ "${entry##*|}" == "ERROR" ]]; then
      return 0
    fi
  done
  return 1
}

#######################################
# Writes the text handed to the mailer on its stdin: finish time, scraper
# count, total saved, total warns (one per line), then one line per
# SUMMARY entry.
# Globals:   finish_time, count, total_saved, total_warns, SUMMARY (read)
# Outputs:   the payload on stdout
#######################################
emit_summary_payload() {
  local entry

  echo "$finish_time"
  echo "$count"
  echo "$total_saved"
  echo "$total_warns"
  for entry in ${SUMMARY[@]+"${SUMMARY[@]}"}; do
    echo "$entry"
  done
}

has_errors=0
if summary_has_errors; then
  has_errors=1
fi

if [[ $has_errors -eq 1 ]] && [[ -n "${SMTP_HOST:-}" ]]; then
  echo ""
  echo "Sending error summary email..."
  emit_summary_payload | ruby /app/tools/send_summary_email.rb
fi
|