circularhead.rb 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. # Circular Head Council — Planning page list scraper
  2. require "nokogiri"
  3. require_relative "../lib/http"
  4. require_relative "../lib/db"
  5. require_relative "../lib/util"
  6. require_relative "../lib/enrich"
  # Runtime configuration: the destination table name comes from the
  # environment (ENV.fetch raises KeyError when TABLE_NAME is unset) and URL is
  # the council planning page that gets scraped.
  TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_circularhead
  URL = "https://www.circularhead.tas.gov.au/council-services/development/planning"

  DB.ensure_table!(TABLE)

  # Optional columns for extras (document link and the raw link title).
  # Best effort: the first statement that fails aborts the rest, and the
  # failure is reported as a warning instead of stopping the script.
  begin
    [
      "ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url VARCHAR(1024) NULL",
      "ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS title_reference TEXT NULL"
    ].each { |statement| DB.client.query(statement) }
  rescue StandardError => e
    warn "Optional column add skipped: #{e.class} #{e.message}"
  end
  # Resolve +href+ against +base+ and return the absolute URL as a String.
  #
  # Surrounding whitespace in +href+ is removed before resolving, so a padded
  # link still resolves instead of being rejected by the URI parser.
  #
  # @param base [String] absolute URL the link was found on
  # @param href [String, nil] raw href attribute value (may be relative)
  # @return [String] the absolute URL; "" when +href+ is nil or blank; the
  #   stripped +href+ unchanged when it cannot be parsed or joined
  def abs_url(base, href)
    link = href.to_s.strip
    return "" if link.empty?

    URI.join(base, link).to_s
  rescue URI::Error, ArgumentError
    # Unparseable link (e.g. it contains an unescaped space) or unusable base:
    # keep the raw link so the caller still has something to store.
    link
  end
  # Fetch the planning page and store one row per listed link.
  #
  # Each link title is split on " - ": the first piece is used as the council
  # reference, the last piece (cut at the first "(") as the description, and
  # any pieces in between as the address.
  html = Http.get(URL)
  doc = Nokogiri::HTML(html)
  items = doc.css("li.link-listing__no-icon")
  puts "Found #{items.length} items for #{TABLE}"

  saved = 0
  items.each do |li|
    a = li.at_css("a")
    next unless a

    title_reference = a.text.to_s.strip
    document_url = abs_url(URL, a["href"].to_s)

    parts = title_reference.split(" - ")
    council_reference = parts.first.to_s.strip
    description = parts.last.to_s.split("(").first.to_s.strip
    address =
      if parts.length > 2
        parts[1..-2].join(" - ").strip
      else
        # No middle section: fall back to the title, capped at 140 characters.
        title_reference[0, 140]
      end

    # Require the key fields
    next if council_reference.empty? || address.empty?

    DB.upsert(TABLE, {
      description: description,
      date_received: nil, # no dates on the list view
      date_received_raw: "",
      address: address,
      council_reference: council_reference,
      applicant: "",
      owner: ""
    })

    enrich_after_upsert!(
      table: TABLE,
      council_reference: council_reference,
      address: address
    )

    # Save link and title if the columns exist. A database error here only
    # produces a warning; the row itself has already been upserted.
    stmt = nil
    begin
      stmt = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ?, title_reference = ? WHERE council_reference = ? AND address = ?")
      stmt.execute(document_url, title_reference, council_reference, address)
    rescue Mysql2::Error => e
      warn "[circularhead] db update skipped for #{council_reference}: #{e.message}"
    ensure
      # Release the prepared statement now rather than waiting for GC; one is
      # created per item, so unclosed handles would otherwise pile up.
      stmt.close if stmt
    end

    puts "Upserted #{council_reference} -> #{address}"
    saved += 1
  end

  puts "Done #{TABLE}. Saved #{saved} item(s)."