# Dorset Council — Advertised Development Applications
#
# Source: https://www.dorset.tas.gov.au/online-development-application-enquiry
#
# Page structure — each application is a <p><a href="PDF_URL">text</a></p>, e.g.:
#
#   PLA/2026/22: Residential dwelling and carport addition - Chris Triebe
#   and Associates Town Planning Services - 13 Gladstone Road
#   Herrick - Closes 18.04.2026
#
# Text format: REF: DESCRIPTION - APPLICANT - ADDRESS - Closes DD.MM.YYYY
#
# Note: the old eServices portal (eservices.dorset.tas.gov.au) is still live
# and was the previous data source. The council now publishes the advertised
# list on its main website with direct PDF links, which is simpler to scrape.
- require "date"
- require "nokogiri"
- require "uri"
- require "fileutils"
- require_relative "../lib/scraper_helpers"
- require_relative "../lib/util"
- require_relative "../lib/log"
# Destination table name; ENV.fetch raises KeyError when TABLE_NAME is unset.
TABLE = ENV.fetch("TABLE_NAME")

# Council page that lists the currently advertised applications.
URL = "https://www.dorset.tas.gov.au/online-development-application-enquiry"

# PDFs are downloaded only when DOWNLOAD_ATTACHMENTS is exactly "1".
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"

# Root directory for downloaded PDFs, overridable through the environment.
DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

# NOTE(review): DB is not defined in this file — presumably it comes from the
# project helpers and creates the table when missing; confirm.
DB.ensure_table!(TABLE)

# Council reference such as "PLA/2026/22", matched case-insensitively.
REF_RX = /\bPLA\/\d{4}\/\d+\b/i

# "Closes DD.MM.YYYY"; capture 1 is the date text. Day and month may have one
# or two digits, and "." or "-" is accepted between the parts.
CLOSE_RX = /\bCloses\s+(\d{1,2}[.\-]\d{1,2}[.\-]\d{4})\b/i
# Turns an arbitrary value into a string usable as a single path component.
#
# Every run of characters other than word characters (\w), "-" and "." is
# collapsed into one "_". A result made up only of dots ("." or "..") would
# be a directory reference rather than a name, so its dots become "_" too.
#
# @param s [#to_s] raw text, e.g. a council reference or a URL basename
# @return [String] sanitised name; "" when the input is nil or empty
def safe_name(s)
  cleaned = s.to_s.gsub(/[^\w\-.]+/, "_")
  cleaned.match?(/\A\.+\z/) ? cleaned.tr(".", "_") : cleaned
end
# Fetches the advertised PDF for one application and stores it under
# DOWNLOAD_DIR/dorset/<sanitised reference>/.
#
# @param url [String, nil] URL of the document to fetch
# @param council_reference [String] reference used to name the sub-directory
# @return [String, nil] the "/files/dorset/..." path of the saved copy, or nil
#   when url is blank or any StandardError was raised along the way
def download_pdf(url, council_reference)
  return nil if url.to_s.strip.empty?

  ref_dir = safe_name(council_reference)
  target_dir = File.join(DOWNLOAD_DIR, "dorset", ref_dir)
  FileUtils.mkdir_p(target_dir)

  # Name the file after the last path segment of the URL, with a fixed fallback.
  filename = safe_name(File.basename(URI.parse(url).path))
  filename = "document.pdf" if filename.empty?

  payload = Http.get(url, headers: { "Accept" => "application/pdf,*/*", "Referer" => URL })
  File.binwrite(File.join(target_dir, filename), payload)
  puts " saved #{filename} (#{payload.bytesize} bytes)"

  "/files/dorset/#{ref_dir}/#{filename}"
rescue StandardError => e
  # Best effort: a failed download is logged and reported as nil, not raised.
  Log.warn "dorset", "Download failed for #{url}: #{e.class} #{e.message}"
  nil
end
# Fetch the listing page and turn every advertised-application link into a row.
html = Http.get(URL)
doc = Nokogiri::HTML(html)

# Each kept link yields one hash; links without a council reference or without
# an address are skipped (`next` makes filter_map drop them).
items = doc.css("p a[href]").filter_map do |a|
  # Collapse runs of whitespace (including line breaks) into single spaces.
  text = a.text.gsub(/[[:space:]]+/, " ").strip
  next unless (ref_m = text.match(REF_RX))

  ref = ref_m[0]

  # Strip the leading "PLA/YYYY/NNN: " prefix.
  remainder = text.sub(/\A#{Regexp.escape(ref)}:\s*/i, "")

  # Extract the closing date and strip the trailing " - Closes ..." segment.
  close_raw = ""
  on_notice_to = nil
  if (close_m = remainder.match(CLOSE_RX))
    close_raw = close_m[1]
    on_notice_to =
      begin
        # CLOSE_RX accepts "." or "-" between the date parts; normalise to "."
        # so that both spellings parse instead of the "-" form becoming nil.
        Date.strptime(close_raw.tr("-", "."), "%d.%m.%Y")
      rescue ArgumentError
        nil # shaped like a date but not a valid one, e.g. 31.02.2026
      end
    remainder = remainder.sub(/\s*-\s*#{Regexp.escape(close_m[0])}\s*\z/i, "").strip
  end

  # Remaining text: "Description - Applicant - Address".
  # Split on " - "; last part = address, second-to-last = applicant,
  # everything before that = description (which may itself contain " - ").
  parts = remainder.split(/\s+-\s+/)
  if parts.length >= 3
    address = parts.last.strip
    applicant = parts[-2].strip
    description = parts[0..-3].join(" - ").strip
  elsif parts.length == 2
    address = parts.last.strip
    applicant = ""
    description = parts.first.strip
  else
    # No separator at all: treat the whole text as the address.
    address = remainder.strip
    applicant = ""
    description = "Development Application"
  end
  next if address.empty?

  description = "Development Application" if description.empty?

  {
    council_reference: ref,
    address: address,
    description: description,
    applicant: applicant,
    on_notice_to: on_notice_to,
    on_notice_to_raw: close_raw, # date text exactly as published
    document_url: abs_url(URL, a["href"].to_s.strip)
  }
end
puts "Found #{items.length} item(s) for #{TABLE}"

# Fields copied unchanged from each parsed item into the stored row.
row_keys = %i[council_reference address description applicant on_notice_to on_notice_to_raw]

items.each do |item|
  # The PDF is fetched only when attachment downloads are switched on;
  # otherwise (or when the download fails) the local URL stays nil.
  local_url = (download_pdf(item[:document_url], item[:council_reference]) if DOWNLOAD_ATTACHMENTS)

  upsert_and_enrich!(
    table: TABLE,
    row: row_keys.to_h { |key| [key, item[key]] }.merge(owner: ""),
    extras: {
      document_url: item[:document_url],
      local_document_url: local_url
    }
  )
end

puts "Done #{TABLE}. Saved #{items.length} item(s)."