hobartcity.rb 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101
  1. # Hobart City Council – PlanBuild "Currently Advertised" scraper
  2. # Table name is injected by run_all.sh as TABLE_NAME=da_hobartcity
  3. require "nokogiri"
  4. require "open-uri"
  5. require_relative "../lib/db"
  6. require_relative "../lib/util"
  7. require_relative "../lib/http"
  8. require_relative "../lib/enrich"
  9. TABLE = ENV.fetch("TABLE_NAME")
  10. URL = "https://portal.planbuild.tas.gov.au/external/advertisement/search"
  11. # Optional: restrict results to one LGA (substring match)
  12. COUNCIL_FILTER = ENV.fetch("COUNCIL_FILTER", "Hobart City Council").strip
  13. DB.ensure_table!(TABLE)
  14. html = Http.get(URL)
  15. doc = Nokogiri::HTML(html)
  16. # PlanBuild markup shifts occasionally. We try a few result wrappers.
  17. result_blocks = doc.css(".advertisement-result, .panel.panel-default, .panel.panel-info, .result-row, .row")
  18. found = 0
  19. result_blocks.each do |blk|
  20. text = blk.text.strip.gsub(/\s+/, " ")
  21. # Skip blocks that do not look like a single advertised item
  22. next unless text.match?(/Application/i) || text.match?(/Reference/i) || text.match?(/Council/i)
  23. # Extract fields using common column patterns first
  24. address_el = blk.at_css(".col-xs-8, .col-sm-8, .address, [data-field='address']")
  25. ref_el = blk.at_css(".col-xs-4, .col-sm-4, .reference, [data-field='reference']")
  26. address = address_el&.text&.strip.to_s
  27. council_reference = ref_el&.text&.strip.to_s
  28. # Fallbacks from label-value pairs (e.g., "Address: …", "Reference: …")
  29. if address.empty?
  30. m = text.match(/Address:\s*(.+?)(?:\s{2,}|Reference:|$)/i)
  31. address = m[1].strip if m
  32. end
  33. if council_reference.empty?
  34. # m = text.match(/(Application|Reference)\s*(No\.?|Number)?:\s*([A-Za-z0-9\-\./_]+)/i)
  35. REF_RX = %r{(Application|Reference)\s*(No\.?|Number)?:\s*([A-Za-z0-9\-._/]+)}i
  36. m = text.match(REF_RX)
  37. council_reference = (m && m[3]) ? m[3].strip : council_reference
  38. end
  39. # Try to find the LGA/council name in the block text
  40. # Common patterns: "Council: Hobart City Council" or a badge/label nearby
  41. council_name = nil
  42. if (m = text.match(/Council:\s*([A-Za-z \-]+Council)/i))
  43. council_name = m[1].strip
  44. end
  45. # Light filter: if a filter is set and we can't see Hobart in this block, skip it
  46. if COUNCIL_FILTER != "" && council_name && !council_name.include?(COUNCIL_FILTER)
  47. next
  48. elsif COUNCIL_FILTER != "" && council_name.nil?
  49. # If no explicit council field, do a substring check across the block text
  50. next unless text.include?(COUNCIL_FILTER)
  51. end
  52. # Optional extras if present in the block
  53. # Patterns seen across councils vary, so treat all as best-effort
  54. description = ""
  55. if (m = text.match(/(Type of Work|Proposal|Description):\s*(.+?)(?:\s{2,}|Address:|Application|Reference|$)/i))
  56. description = m[2].strip
  57. end
  58. date_received_raw = ""
  59. if (m = text.match(/(Date Lodged|Date Received|Lodged):\s*([0-9]{1,2}\/[0-9]{1,2}\/[0-9]{2,4})/i))
  60. date_received_raw = m[2].strip
  61. end
  62. date_received = Util.parse_aus_date(date_received_raw)
  63. # If we still don't have key fields, skip
  64. next if address.empty? || council_reference.empty?
  65. DB.upsert(TABLE, {
  66. description: description,
  67. date_received: date_received,
  68. date_received_raw: date_received_raw,
  69. address: address,
  70. council_reference: council_reference,
  71. applicant: "", # PlanBuild usually doesn't expose these in the list
  72. owner: ""
  73. })
  74. enrich_after_upsert!(
  75. table: TABLE,
  76. council_reference: council_reference,
  77. address: address
  78. )
  79. puts "Upserted #{council_reference} | #{address}"
  80. found += 1
  81. end
  82. puts "Done #{TABLE}. Found #{found} item(s)."