| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293 |
- # Latrobe Council – PlanBuild "Currently Advertised" scraper
- require "nokogiri"
- require_relative "../lib/http"
- require_relative "../lib/db"
- require_relative "../lib/util"
- require_relative "../lib/enrich"
# Destination table name. Required: ENV.fetch raises KeyError when TABLE_NAME is unset.
TABLE = ENV.fetch("TABLE_NAME")
# Page to scrape; can be overridden through the PLANBUILD_URL environment variable.
URL = ENV.fetch("PLANBUILD_URL", "https://portal.planbuild.tas.gov.au/external/advertisement/search").freeze
# Council name that a block's "Council:" text must include for the block to be saved.
COUNCIL_NAME = "Latrobe Council".freeze
# Reference matcher: "Application" or "Reference", an optional "No."/"Number",
# a colon, then the reference token (capture group 3). Written with %r{...} so
# the "/" inside the character class needs no escaping.
REF_RX = %r{(Application|Reference)\s*(No\.?|Number)?:\s*([A-Za-z0-9\-._/]+)}i
DB.ensure_table!(TABLE)
# Returns the stripped text that follows the first match of +label_regex+ in
# +text+, cut off at the earliest position where any of +stop_regexes+ matches.
#
# @param text [String, nil] text to search
# @param label_regex [Regexp, String] pattern marking where the value starts
# @param stop_regexes [Array<Regexp, String>, Regexp, String, nil] patterns that end the value
# @return [String, nil] the extracted value; nil when +text+ is nil or the
#   label does not occur. When no stop pattern matches, everything after the
#   label is returned.
def extract_text_between(text, label_regex, stop_regexes)
  return nil if text.nil?

  label_match = text.match(label_regex)
  return nil unless label_match

  # Everything after the label; stop offsets below are relative to this tail.
  tail = label_match.post_match
  # String#index accepts both Regexp and String patterns and returns nil on
  # no match, so unmatched stops simply drop out via compact.
  stop = Array(stop_regexes).map { |pattern| tail.index(pattern) }.compact.min
  (stop ? tail[0...stop] : tail).strip
end
# Fetch the advertisement search page and parse it.
html = Http.get(URL)
doc = Nokogiri::HTML(html)

# Candidate containers for a single advertisement.
# NOTE(review): this selector list is broad and can match nested elements
# (e.g. a `.row` inside a `.panel`), so one advertisement may be visited more
# than once; every visit is upserted, enriched and counted in `saved`.
blocks = doc.css(".advertisement-result, .panel.panel-default, .panel.panel-info, .result-row, .row")
saved = 0

blocks.each do |blk|
  # Collapse all whitespace runs so the label regexes below see one flat line.
  text = blk.text.strip.gsub(/\s+/, " ")
  next unless text.match?(/Application|Reference|Council/i)

  # Prefer dedicated elements; fall back to label-based text parsing.
  address_el = blk.at_css(".address, [data-field='address'], .col-xs-8, .col-sm-8")
  ref_el = blk.at_css(".reference, [data-field='reference'], .col-xs-4, .col-sm-4")
  address = address_el&.text&.strip.to_s
  council_reference = ref_el&.text&.strip.to_s

  if address.empty?
    # extract_text_between returns nil when there is no "Address:" label;
    # coerce to "" so the emptiness guard below skips the block instead of
    # raising NoMethodError on nil.
    address = extract_text_between(
      text,
      /Address:\s*/i,
      [/Reference:/i, /Application/i, /Council:/i, /\z/]
    ).to_s
  end

  if council_reference.empty? && (m = text.match(REF_RX))
    council_reference = m[3].strip
  end

  # Only keep blocks that name the configured council.
  council_match = text.match(/Council:\s*([A-Za-z \-]+Council)/i)
  council_name = council_match && council_match[1].strip
  next unless council_name&.include?(COUNCIL_NAME)

  description = extract_text_between(
    text,
    /(Type of Work|Proposal|Description):\s*/i,
    [/Address:/i, /Application/i, /Reference/i, /Council:/i, /\z/]
  ) || ""

  # Keep the raw d/m/y string alongside the parsed value.
  date_match = text.match(/(Date Lodged|Date Received|Lodged):\s*([0-9]{1,2}\/[0-9]{1,2}\/[0-9]{2,4})/i)
  date_received_raw = date_match ? date_match[2].strip : ""
  date_received = Util.parse_aus_date(date_received_raw)

  # A record needs both an address and a reference to be saved.
  next if address.empty? || council_reference.empty?

  DB.upsert(TABLE, {
    description: description,
    date_received: date_received,
    date_received_raw: date_received_raw,
    address: address,
    council_reference: council_reference,
    applicant: "",
    owner: ""
  })

  enrich_after_upsert!(
    table: TABLE,
    council_reference: council_reference,
    address: address
  )
  puts "Upserted #{council_reference} | #{address}"
  saved += 1
end

puts "Done #{TABLE}. Saved #{saved} item(s)."
|