# circularhead.rb
# Circular Head Council — Planning page list scraper
  require "uri"

  require "nokogiri"

  require_relative "../lib/enrich"
  require_relative "../lib/log"
  # Destination table name, read from the TABLE_NAME environment variable
  # (run_all.sh -> da_circularhead). ENV.fetch raises KeyError when it is unset.
  TABLE = ENV.fetch("TABLE_NAME")

  # Planning page whose link list is scraped; also the base for relative links.
  URL = "https://www.circularhead.tas.gov.au/council-services/development/planning"

  # Presumably creates the table when missing — confirm against DB.ensure_table!.
  DB.ensure_table!(TABLE)
  # Resolve a possibly-relative link against the page it was found on.
  #
  # @param base [String] absolute URL of the page containing the link
  # @param href [String, nil] raw href attribute value
  # @return [String] the absolute URL; "" when href is nil or blank; the
  #   whitespace-trimmed href itself when it cannot be parsed/joined as a URI
  def abs_url(base, href)
    # Surrounding whitespace makes URI.join raise, so trim before joining.
    link = href.to_s.strip
    return "" if link.empty?

    URI.join(base, link).to_s
  rescue URI::Error, ArgumentError
    # Best-effort: a malformed link is still worth keeping as-is.
    link
  end
  # Fetch the planning page and collect the application link items.
  html = Http.get(URL)
  doc = Nokogiri::HTML(html)
  items = doc.css("li.link-listing__no-icon")
  puts "Found #{items.length} items for #{TABLE}"

  saved = 0
  items.each do |li|
    a = li.at_css("a")
    next unless a

    title_reference = a.text.to_s.strip
    document_url = abs_url(URL, a["href"].to_s)

    # The title is split on " - ": the first segment is used as the council
    # reference, the last as the description, anything between as the address.
    parts = title_reference.split(" - ")
    council_reference = parts.first.to_s.strip
    # Keep only the text before the first "(" in the description segment.
    description = parts.last.to_s.split("(").first.to_s.strip
    address =
      if parts.length > 2
        parts[1..-2].join(" - ").strip
      else
        # No middle segment: fall back to the title, capped at 140 characters.
        title_reference[0, 140]
      end

    # No dates on the list view.
    date_received_raw = ""
    date_received = nil

    # Both key fields are required; skip the item otherwise.
    next if council_reference.empty? || address.empty?

    DB.upsert(TABLE, {
      description: description,
      date_received: date_received,
      date_received_raw: date_received_raw,
      address: address,
      council_reference: council_reference,
      applicant: "",
      owner: ""
    })

    enrich_after_upsert!(
      table: TABLE,
      council_reference: council_reference,
      address: address
    )

    # Best-effort: store the link and title. A Mysql2 error (for example when
    # the table lacks these columns) is logged and the item still counts as saved.
    upd = nil
    begin
      upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ?, title_reference = ? WHERE council_reference = ? AND address = ?")
      upd.execute(document_url, title_reference, council_reference, address)
    rescue Mysql2::Error => e
      Log.warn "scraper", "[circularhead] db update skipped for #{council_reference}: #{e.message}"
    ensure
      # Release the statement prepared for this item so handles do not pile up.
      upd.close if upd.respond_to?(:close)
    end

    puts "Upserted #{council_reference} -> #{address}"
    saved += 1
  end

  puts "Done #{TABLE}. Saved #{saved} item(s)."