# Latrobe Council – PlanBuild “Currently Advertised” scraper
#
# Fetches the PlanBuild advertisement search page, keeps the result blocks
# whose "Council:" label contains COUNCIL_NAME, and upserts one row per
# application into TABLE.

require "nokogiri"
require_relative "../lib/http"
require_relative "../lib/db"
require_relative "../lib/util"
require_relative "../lib/enrich"

TABLE = ENV.fetch("TABLE_NAME")
URL = ENV.fetch("PLANBUILD_URL", "https://portal.planbuild.tas.gov.au/external/advertisement/search")
COUNCIL_NAME = "Latrobe Council"

# Safe reference matcher (slashes inside are fine with %r{...})
REF_RX = %r{(Application|Reference)\s*(No\.?|Number)?:\s*([A-Za-z0-9\-._/]+)}i

DB.ensure_table!(TABLE)

# Returns the stripped text that follows the first match of +label_regex+,
# cut off at the earliest position where any of +stop_regexes+ matches.
#
# @param text [String] flattened text to search
# @param label_regex [Regexp] label that introduces the wanted value
# @param stop_regexes [Array<Regexp>] patterns that end the value
# @return [String, nil] the extracted value (possibly empty), or nil when
#   +label_regex+ does not match at all
def extract_text_between(text, label_regex, stop_regexes)
  m = text.match(label_regex)
  return unless m

  tail = m.post_match
  stop = stop_regexes.map { |r| tail.index(r) }.compact.min
  (stop ? tail[0...stop] : tail).strip
end

html = Http.get(URL)
doc = Nokogiri::HTML(html)

# NOTE(review): this selector list is broad (e.g. ".row") and may match
# containers nested inside one another, so the same application could be
# visited more than once — presumably DB.upsert makes that harmless; confirm.
blocks = doc.css(".advertisement-result, .panel.panel-default, .panel.panel-info, .result-row, .row")

saved = 0

blocks.each do |blk|
  # Collapse all whitespace runs so the label regexes can work on one line.
  text = blk.text.strip.gsub(/\s+/, " ")
  next unless text.match?(/Application|Reference|Council/i)

  address_el = blk.at_css(".address, [data-field='address'], .col-xs-8, .col-sm-8")
  ref_el = blk.at_css(".reference, [data-field='reference'], .col-xs-4, .col-sm-4")

  address = address_el&.text&.strip.to_s
  council_reference = ref_el&.text&.strip.to_s

  if address.empty?
    # extract_text_between returns nil when there is no "Address:" label;
    # coerce to "" so the emptiness check below cannot hit nil.
    address = extract_text_between(
      text,
      /Address:\s*/i,
      [/Reference:/i, /Application/i, /Council:/i, /\z/]
    ).to_s
  end

  if council_reference.empty? && (m = text.match(REF_RX))
    council_reference = m[3].strip
  end

  # Only keep blocks whose "Council:" label names this council.
  council_name = text[/Council:\s*([A-Za-z \-]+Council)/i, 1]&.strip
  next unless council_name&.include?(COUNCIL_NAME)

  # Both fields are required for a row; skip before doing any further parsing.
  next if address.empty? || council_reference.empty?

  description = extract_text_between(
    text,
    /(Type of Work|Proposal|Description):\s*/i,
    [/Address:/i, /Application/i, /Reference/i, /Council:/i, /\z/]
  ) || ""

  # Raw d/m/y string as it appears on the page, or "" when no date label matches.
  date_received_raw =
    text[%r{(Date Lodged|Date Received|Lodged):\s*([0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4})}i, 2].to_s.strip
  date_received = Util.parse_aus_date(date_received_raw)

  DB.upsert(TABLE, {
    description: description,
    date_received: date_received,
    date_received_raw: date_received_raw,
    address: address,
    council_reference: council_reference,
    applicant: "",
    owner: ""
  })

  enrich_after_upsert!(
    table: TABLE,
    council_reference: council_reference,
    address: address
  )

  puts "Upserted #{council_reference} | #{address}"
  saved += 1
end

puts "Done #{TABLE}. Saved #{saved} item(s)."