# Devonport City Council — Advertised Planning Permit Applications (WP File Download)
#
# Scrapes the advertised-applications page, derives one record per download
# link (reference, address, description, dates), upserts it into TABLE and,
# when DOWNLOAD_ATTACHMENTS=1, saves the linked PDF under DOWNLOAD_DIR/TABLE.
require "date"
require "nokogiri"
require "fileutils"
require "net/http"
require "uri"
require_relative "../lib/enrich"
require_relative "../lib/log"
require_relative "../lib/util"

TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_devonportcity
URL = "https://www.devonport.tas.gov.au/building-development/planning/advertised-planning-permit-applications/"
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"

DB.ensure_table!(TABLE)

# Resolves +href+ against +base+ and returns the absolute URL as a String.
# Returns "" for a blank href; if the join fails, returns the href unchanged.
def abs_url(base, href)
  return "" if href.to_s.strip.empty?

  URI.join(base, href).to_s
rescue StandardError
  href.to_s
end

# Replaces every run of characters other than word characters, "." and "-"
# with "_" and truncates the result to 180 characters.
def sanitize_filename(s)
  s.to_s.gsub(/[^\w.\-]+/, "_")[0, 180]
end

# ---------- Reference + date helpers ----------

# Accepts PA/DA with separators like ".", "/", "-" and optional spaces.
# Returns the reference as "PA 2025 / 0103" (prefix upcased), or nil when no
# reference is found. The match is case-insensitive, so lower-case forms such
# as "pa2025.0103" (e.g. inside a URL) are covered by the same pattern.
def normalize_ref(str)
  m = str.to_s.strip.match(/\b(PA|DA)\s*([12]\d{3})[.\-\/\s]+([A-Za-z0-9]{3,})\b/i)
  m && "#{m[1].upcase} #{m[2]} / #{m[3]}"
end

# Returns the first date-looking substring of +str+, trying in order:
# d/m/y, "2 September 2025", "September 2, 2025", d-m-yyyy.
# Returns "" when none of the patterns match.
def extract_date_token(str)
  s = str.to_s
  s[/\b\d{1,2}\/\d{1,2}\/\d{2,4}\b/] ||
    s[/\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b/] ||
    s[/\b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b/] ||
    s[/\b\d{1,2}-\d{1,2}-\d{4}\b/] ||
    ""
end

# Parses +s+ with Util.parse_aus_date. Returns nil for blank input or when
# the parser raises.
def parse_date_any(s)
  return nil if s.to_s.strip.empty?

  Util.parse_aus_date(s)
rescue StandardError
  nil
end

# Extracts the end of the advertising period from a file title.
# Returns a two-element array: [parsed date or nil, raw date token or ""].
def extract_on_notice_to_from_title(title)
  # Prefer explicit phrase
  if (m = title.to_s.match(/advertising\s+period\s+ends?\s+(.+?)\s*(?:$|\(|-)/i))
    tkn = extract_date_token(m[1])
    return parse_date_any(tkn), tkn unless tkn.empty?
  end

  # Fallback: any date token in the title
  tkn = extract_date_token(title)
  [parse_date_any(tkn), tkn]
end

# ---------- Simple PDF downloader ----------

# Streams +url+ to DOWNLOAD_DIR/TABLE/<reference>__<basename>.
# Does nothing when +url+ is blank or DOWNLOAD_ATTACHMENTS is off. Failures
# (non-200 response or any StandardError) are logged as warnings, not raised.
def download_pdf(url, council_reference)
  tmp_path = nil
  return if url.to_s.strip.empty?
  return unless DOWNLOAD_ATTACHMENTS

  uri = URI(url)
  out_dir = File.join(DOWNLOAD_DIR, TABLE)
  FileUtils.mkdir_p(out_dir)

  base = sanitize_filename(File.basename(uri.path))
  prefix = sanitize_filename(council_reference.to_s.gsub(" / ", "-"))
  out_path = File.join(out_dir, "#{prefix}__#{base}")
  # Write to a sibling temp file first so an interrupted transfer never
  # leaves a truncated PDF at the final path.
  tmp_path = "#{out_path}.part"

  Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https") do |http|
    req = Net::HTTP::Get.new(uri)
    req["User-Agent"] = "Mozilla/5.0"
    req["Accept"] = "application/pdf,*/*;q=0.8"
    req["Accept-Encoding"] = "identity"

    http.request(req) do |resp|
      if resp.code.to_i == 200
        File.open(tmp_path, "wb") do |f|
          resp.read_body { |chunk| f.write(chunk) }
        end
        File.rename(tmp_path, out_path)
        puts "Saved PDF to #{out_path}"
      else
        Log.warn "scraper", "PDF fetch failed (#{resp.code} #{resp.message}) for #{url}"
      end
    end
  end
rescue StandardError => e
  Log.warn "scraper", "PDF save error for #{url}: #{e.class} #{e.message}"
ensure
  # No-op after a successful rename; removes the partial file otherwise.
  FileUtils.rm_f(tmp_path) if tmp_path
end

# ---------- Fetch + parse ----------

html = Http.get(URL)
doc = Nokogiri::HTML(html)

# Devonport uses WP File Download. Rows live under .wpfd-search-result > table > tbody > tr
rows = doc.css(".wpfd-search-result tbody tr")
puts "Found #{rows.length} row(s) for #{TABLE}"

saved = 0

rows.each do |row|
  link = row.at_css("a.wpfd_downloadlink")
  next unless link

  title_reference = link["title"].to_s.strip
  href = link["href"].to_s.strip
  document_url = abs_url(URL, href)

  # Typical title:
  # "PA2025.0103 - 11-17 Stewart Street Devonport - Signage - Advertising period ends 2 September 2025"
  parts = title_reference.split(" - ").map(&:strip)
  raw_ref = parts[0].to_s
  address = parts[1].to_s

  # Description: all middle parts until the last one (often the date/notice bit)
  middle = parts[2..-1] || []

  # Pull out on-notice first so we can filter
  on_notice_to, on_notice_to_raw = extract_on_notice_to_from_title(title_reference)
  middle.reject! { |part| part =~ /advertising\s+period\s+ends?/i || part == on_notice_to_raw }
  description = middle.join(" - ").strip
  description = "Development Application" if description.empty?

  # Date added column -> date_received
  date_received_raw = row.at_css(".file_created")&.text&.strip.to_s
  date_received = parse_date_any(date_received_raw)
  if date_received.nil? && !date_received_raw.empty?
    # handle 19-08-2025
    begin
      date_received = Date.strptime(date_received_raw, "%d-%m-%Y")
    rescue ArgumentError
      # Date::Error is an ArgumentError. parse_date_any already returned nil
      # for this value above, so there is nothing further to try.
      date_received = nil
    end
  end

  # Normalize / derive council_reference
  council_reference =
    normalize_ref(raw_ref) ||
    normalize_ref(title_reference) ||
    normalize_ref(document_url) ||
    raw_ref # last resort (raw)

  # Fallback address if missing
  if address.to_s.empty?
    # Try data-filetitle on hidden input (same text as title)
    hidden_title = row.at_css("input.wpfd_file_preview_link_download")&.[]("data-filetitle").to_s
    parts2 = hidden_title.split(" - ").map(&:strip)
    address = parts2[1].to_s unless parts2.empty?
    address = title_reference[0, 140] if address.to_s.empty?
  end

  next if council_reference.to_s.empty? || address.to_s.empty?

  # Download PDF if requested
  download_pdf(document_url, council_reference)

  DB.upsert(TABLE, {
    description: description,
    date_received: date_received,
    date_received_raw: date_received_raw,
    on_notice_to: on_notice_to,
    on_notice_to_raw: on_notice_to_raw,
    address: address,
    council_reference: council_reference,
    applicant: "",
    owner: ""
  })

  enrich_after_upsert!(
    table: TABLE,
    council_reference: council_reference,
    address: address
  )

  # Store extras if columns exist
  upd = nil
  begin
    upd = DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ?, title_reference = ? WHERE council_reference = ? AND address = ?")
    upd.execute(document_url, title_reference, council_reference, address)
  rescue Mysql2::Error => e
    Log.warn "scraper", "[devonportcity] db update skipped for #{council_reference}: #{e.message}"
  ensure
    # Release the statement handle now instead of waiting for GC; one is
    # prepared per row.
    upd.close if upd.respond_to?(:close)
  end

  puts "Upserted #{council_reference} -> #{address}"
  saved += 1
end

puts "Done #{TABLE}. Saved #{saved} item(s)."