glamorgan.rb 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. # Glamorgan Spring Bay Council — Public Notices scraper (site page, not PlanBuild)
  2. require "nokogiri"
  3. require_relative "../lib/http"
  4. require_relative "../lib/db"
  5. require_relative "../lib/util"
  6. require_relative "../lib/enrich"
  7. TABLE = ENV.fetch("TABLE_NAME") # run_all.sh sets from filename: da_glamorgan
  8. URL = "https://gsbc.tas.gov.au/services-facilities/public-notices/"
  9. DB.ensure_table!(TABLE)
  10. # Optional column to store the PDF or document link
  11. begin
  12. DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS document_url VARCHAR(1024) NULL")
  13. rescue => e
  14. warn "document_url add skipped: #{e.class} #{e.message}"
  15. end
  16. def text_or(node, default = "")
  17. node ? node.text.strip : default
  18. end
  19. # Try to extract a reference from visible text or file names
  20. REF_RX = %r{(DA|PLA|APP|APPLICATION)\s*([0-9]{4})\s*/\s*([A-Za-z0-9\-_.]+)}i
  21. def extract_reference(str)
  22. s = str.to_s
  23. if (m = s.match(REF_RX))
  24. return "#{m[1].upcase} #{m[2]} / #{m[3]}"
  25. end
  26. nil
  27. end
  28. def safe_abs(base, href)
  29. return "" if href.to_s.strip.empty?
  30. URI.join(base, href).to_s
  31. rescue URI::InvalidURIError
  32. href.to_s
  33. end
  34. html = Http.get(URL)
  35. doc = Nokogiri::HTML(html)
  36. # Find a table that looks like the notices list
  37. tables = doc.css("table")
  38. table = tables.find do |t|
  39. headers = t.css("thead th").map { |th| th.text.strip.downcase }
  40. headers.empty? ? t.css("tbody tr:first-child td").size >= 4 : headers.size >= 4
  41. end
  42. unless table
  43. puts "No notices table found on #{URL}"
  44. exit 0
  45. end
  46. rows = table.css("tbody tr")
  47. puts "Found #{rows.length} rows for #{TABLE}"
  48. saved = 0
  49. rows.each_with_index do |row, idx|
  50. tds = row.css("td")
  51. next if tds.empty?
  52. # Expected columns on this page:
  53. # 0: Description, 1: Address, 2: Application date, 3: On notice to, 4: Document link
  54. description = text_or(tds[0])
  55. address = text_or(tds[1])
  56. application_raw = text_or(tds[2])
  57. on_notice_to_raw = text_or(tds[3])
  58. link_el = tds[4]&.at_css("a")
  59. document_url = link_el ? safe_abs(URL, link_el["href"]) : ""
  60. date_received = Util.parse_aus_date(application_raw)
  61. on_notice_to = Util.parse_aus_date(on_notice_to_raw)
  62. # Reference: try text, then file name, then a stable fallback
  63. council_reference = extract_reference(description) ||
  64. extract_reference(address) ||
  65. extract_reference(File.basename(document_url)) ||
  66. begin
  67. date_key = date_received || on_notice_to
  68. base = [date_key&.strftime("%Y%m%d"), address.gsub(/\s+/, " ")[0,40]].compact.join(" ")
  69. base.empty? ? "GSBC-ROW-#{idx+1}" : "GSBC #{base}"
  70. end
  71. next if address.empty? || council_reference.empty?
  72. DB.upsert(TABLE, {
  73. description: description,
  74. date_received: date_received,
  75. date_received_raw: application_raw,
  76. address: address,
  77. council_reference: council_reference,
  78. applicant: "",
  79. owner: ""
  80. })
  81. enrich_after_upsert!(
  82. table: TABLE,
  83. council_reference: council_reference,
  84. address: address
  85. )
  86. begin
  87. upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ? WHERE council_reference = ? AND address = ?")
  88. upd.execute(document_url, council_reference, address)
  89. rescue Mysql2::Error => e
  90. warn "[glamorgan] db update skipped for #{council_reference}: #{e.message}"
  91. end
  92. puts "Upserted #{council_reference} -> #{address}"
  93. saved += 1
  94. end
  95. puts "Done #{TABLE}. Saved #{saved} item(s)."