# Hobart City Council – PlanBuild "Currently Advertised" scraper
# Table name is injected by run_all.sh as TABLE_NAME=da_hobartcity
#
# Fetches the PlanBuild advertisement search page, scans every element that
# could wrap one advertised application, extracts its fields on a best-effort
# basis and upserts each usable record into TABLE.
require "nokogiri"
require "open-uri"
require_relative "../lib/db"
require_relative "../lib/util"
require_relative "../lib/http"
require_relative "../lib/enrich"

# Destination table; ENV.fetch raises KeyError when TABLE_NAME is not set.
TABLE = ENV.fetch("TABLE_NAME")
URL = "https://portal.planbuild.tas.gov.au/external/advertisement/search"

# Optional: restrict results to one LGA (substring match).
# An empty COUNCIL_FILTER disables the filter entirely.
COUNCIL_FILTER = ENV.fetch("COUNCIL_FILTER", "Hobart City Council").strip

# Matches "Application No: X", "Reference Number: X", "Reference: X", ...;
# capture 3 is the reference itself. Defined once at file level: assigning a
# constant inside the per-block loop re-assigns it on later passes, which makes
# Ruby print "already initialized constant" warnings.
REF_RX = %r{(Application|Reference)\s*(No\.?|Number)?:\s*([A-Za-z0-9\-._/]+)}i

DB.ensure_table!(TABLE)

html = Http.get(URL)
doc = Nokogiri::HTML(html)

# PlanBuild markup shifts occasionally. We try a few result wrappers.
result_blocks = doc.css(".advertisement-result, .panel.panel-default, .panel.panel-info, .result-row, .row")

found = 0
result_blocks.each do |blk|
  # Collapse every whitespace run to one space so the label regexes below see
  # a single flat line.
  text = blk.text.strip.gsub(/\s+/, " ")

  # Skip blocks that do not look like a single advertised item
  next unless text.match?(/Application|Reference|Council/i)

  # Extract fields using common column patterns first
  address_el = blk.at_css(".col-xs-8, .col-sm-8, .address, [data-field='address']")
  ref_el = blk.at_css(".col-xs-4, .col-sm-4, .reference, [data-field='reference']")
  address = address_el&.text&.strip.to_s
  council_reference = ref_el&.text&.strip.to_s

  # Fallbacks from label-value pairs (e.g., "Address: …", "Reference: …")
  if address.empty?
    # NOTE: `text` has already been whitespace-collapsed, so the \s{2,}
    # terminator cannot fire; the capture ends at "Reference:" or end of text.
    m = text.match(/Address:\s*(.+?)(?:\s{2,}|Reference:|$)/i)
    address = m[1].strip if m
  end
  if council_reference.empty?
    m = text.match(REF_RX)
    # Group 3 is mandatory in REF_RX, so a match always carries it.
    council_reference = m[3].strip if m
  end

  # Without both key fields there is nothing worth storing, so bail out before
  # doing any of the optional parsing below.
  next if address.empty? || council_reference.empty?

  # Try to find the LGA/council name in the block text
  # Common patterns: "Council: Hobart City Council" or a badge/label nearby
  council_name = nil
  if (m = text.match(/Council:\s*([A-Za-z \-]+Council)/i))
    council_name = m[1].strip
  end

  # Light filter: when a filter is set, test it against the explicit council
  # field if one was found, otherwise against the whole block text.
  unless COUNCIL_FILTER.empty?
    haystack = council_name || text
    next unless haystack.include?(COUNCIL_FILTER)
  end

  # Optional extras if present in the block
  # Patterns seen across councils vary, so treat all as best-effort
  description = ""
  if (m = text.match(/(Type of Work|Proposal|Description):\s*(.+?)(?:\s{2,}|Address:|Application|Reference|$)/i))
    description = m[2].strip
  end

  date_received_raw = ""
  if (m = text.match(/(Date Lodged|Date Received|Lodged):\s*([0-9]{1,2}\/[0-9]{1,2}\/[0-9]{2,4})/i))
    date_received_raw = m[2].strip
  end
  date_received = Util.parse_aus_date(date_received_raw)

  DB.upsert(TABLE, {
    description: description,
    date_received: date_received,
    date_received_raw: date_received_raw,
    address: address,
    council_reference: council_reference,
    applicant: "", # PlanBuild usually doesn't expose these in the list
    owner: ""
  })

  enrich_after_upsert!(
    table: TABLE,
    council_reference: council_reference,
    address: address
  )

  puts "Upserted #{council_reference} | #{address}"
  found += 1
end

puts "Done #{TABLE}. Saved #{found} item(s)."