|
@@ -1,144 +1,167 @@
|
|
|
# West Tamar Council — Advertised Planning Applications
|
|
# West Tamar Council — Advertised Planning Applications
|
|
|
|
|
+#
|
|
|
|
|
+# Source: https://www.wtc.tas.gov.au/advertised-planning-applications/
|
|
|
|
|
+#
|
|
|
|
|
+# Page structure — all entries on one page, grouped by h2 headings:
|
|
|
|
|
+#
|
|
|
|
|
+# <h2>92 Sunset Boulevard, Clarence Point</h2>
|
|
|
|
|
+# <p>
|
|
|
|
|
+# <strong>APPLICANT:</strong> J & E West<br>
|
|
|
|
|
+# <strong>PROPOSAL:</strong> Residential - Dwelling & Outbuilding<br>
|
|
|
|
|
+# <strong>LOCATION:</strong> 92 Sunset Boulevard, Clarence Point<br>
|
|
|
|
|
+# <strong>CLOSES:</strong> 5pm on 16 April 2026
|
|
|
|
|
+# </p>
|
|
|
|
|
+# <ul>
|
|
|
|
|
+# <li>Application Number: PA NO: 2025065</li>
|
|
|
|
|
+# <li>Closes 16 April 2026</li>
|
|
|
|
|
+# </ul>
|
|
|
|
|
+# <p><a href="https://assets.wtc.tas.gov.au/...PA2025065...pdf">Proposal description</a></p>
|
|
|
|
|
|
|
|
require "nokogiri"
|
|
require "nokogiri"
|
|
|
|
|
+require "uri"
|
|
|
|
|
+require "fileutils"
|
|
|
|
|
|
|
|
require_relative "../lib/scraper_helpers"
|
|
require_relative "../lib/scraper_helpers"
|
|
|
require_relative "../lib/util"
|
|
require_relative "../lib/util"
|
|
|
require_relative "../lib/log"
|
|
require_relative "../lib/log"
|
|
|
-TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_westtamar
|
|
|
|
|
-URL = "https://www.wtc.tas.gov.au/advertised-planning-applications/"
|
|
|
|
|
|
|
+
|
|
|
|
|
# Destination table name; required (run_all.sh supplies it, e.g. da_westtamar).
TABLE = ENV.fetch("TABLE_NAME")
# Single listing page that carries every advertised application.
URL = "https://www.wtc.tas.gov.au/advertised-planning-applications/"
# Attachments are only downloaded when DOWNLOAD_ATTACHMENTS is exactly "1".
DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
# Root folder for downloaded attachments, overridable via the environment.
DOWNLOAD_DIR = ENV.fetch("DOWNLOAD_DIR", "/app/downloads")

DB.ensure_table!(TABLE)
|
|
|
|
|
|
|
|
-REF_RX_SLASH = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i
|
|
|
|
|
-REF_RX_HYPHEN = %r{\bDA\s*(\d{1,4})\s*-\s*(20\d{2})\b}i
|
|
|
|
|
-def extract_ref(text)
|
|
|
|
|
- s = text.to_s
|
|
|
|
|
- if (m = s.match(REF_RX_SLASH))
|
|
|
|
|
- return "DA #{m[1]} / #{m[2]}"
|
|
|
|
|
- end
|
|
|
|
|
- if (m = s.match(REF_RX_HYPHEN))
|
|
|
|
|
- return "DA #{m[2]} / #{m[1]}"
|
|
|
|
|
- end
|
|
|
|
|
- if (m = s.match(/\bDA(20\d{2})(\d{3,})\b/i))
|
|
|
|
|
- return "DA #{m[1]} / #{m[2]}"
|
|
|
|
|
- end
|
|
|
|
|
|
|
# Sanitise an arbitrary value for use as a single path component.
#
# Every run of characters other than word characters, "-" and "." becomes one "_".
# Names made up solely of dots ("." / "..") are directory references rather than
# usable file names, so their dots are replaced with underscores as well.
#
# @param s [#to_s] raw name (nil is treated as an empty string)
# @return [String] the sanitised name; empty when the input is empty
def safe_name(s)
  cleaned = s.to_s.gsub(/[^\w\-.]+/, "_")
  cleaned.match?(/\A\.+\z/) ? cleaned.tr(".", "_") : cleaned
end
|
|
|
|
|
+
|
|
|
|
|
# Download one application document into the local attachment folder.
#
# The file is written to DOWNLOAD_DIR/westtamar/<sanitised reference>/<sanitised file name>,
# where the file name comes from the URL path (query string ignored) and falls back to
# "document.pdf". Any error is logged and swallowed so a broken link does not abort the run.
#
# @param url [String, nil] URL of the document
# @param council_reference [String] application reference, used as the folder name
# @return [String, nil] "/files/westtamar/<reference>/<file>" for the saved copy, or nil
#   when the URL is blank, the response body is empty, or the download/write fails
def download_pdf(url, council_reference)
  return nil if url.to_s.strip.empty?

  ref_dir = safe_name(council_reference)
  fname = safe_name(File.basename(URI.parse(url).path))
  fname = "document.pdf" if fname.empty?

  body = Http.get(url)
  # NOTE(review): whether Http.get can return nil/"" is not visible here; guarding so a
  # zero-byte file is never written and reported as a successful download.
  if body.nil? || body.empty?
    Log.warn "westtamar", "Download failed for #{url}: empty response body"
    return nil
  end

  dir = File.join(DOWNLOAD_DIR, "westtamar", ref_dir)
  FileUtils.mkdir_p(dir)
  File.binwrite(File.join(dir, fname), body)
  puts " saved #{fname} (#{body.bytesize} bytes)"

  "/files/westtamar/#{ref_dir}/#{fname}"
rescue StandardError => e
  Log.warn "westtamar", "Download failed for #{url}: #{e.class} #{e.message}"
  nil
end
|
|
|
|
|
|
|
|
-def extract_date_like(str)
|
|
|
|
|
- s = str.to_s
|
|
|
|
|
- return $1 if s =~ /(\b\d{1,2}\/\d{1,2}\/\d{2,4}\b)/
|
|
|
|
|
- return $1 if s =~ /(\b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b)/
|
|
|
|
|
- return $1 if s =~ /(\b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)/
|
|
|
|
|
- ""
|
|
|
|
|
|
|
# Parse "<strong>KEY:</strong> VALUE<br>" pairs from a <p> node.
#
# Labels are matched case-insensitively and stored upper-cased, so "Applicant:" and
# "APPLICANT:" both end up under "APPLICANT". Lines without a "LABEL: value" shape
# (or with an empty value) are ignored; a repeated label keeps its last value.
#
# @param p_node [#inner_html, nil] paragraph node holding the label/value lines
# @return [Hash{String => String}] upper-cased label => value; empty when p_node is nil
def parse_strong_labels(p_node)
  return {} unless p_node

  # Turn <br> into newlines so each label/value pair sits on its own line,
  # then re-parse the fragment to drop the remaining markup.
  html = p_node.inner_html.gsub(/<br\s*\/?>/i, "\n")
  Nokogiri::HTML.fragment(html).text.split("\n").each_with_object({}) do |raw, kv|
    # Collapse any mix of non-breaking and ordinary whitespace into one space
    line = raw.gsub(/[\u00a0\s]+/, " ").strip
    next if line.empty?

    m = line.match(/\A([A-Z][A-Z\s]{1,20}):\s*(.+)\z/i)
    kv[m[1].strip.upcase] = m[2].strip if m
  end
end
|
|
|
|
|
|
|
|
-def extract_on_notice_raw(text)
|
|
|
|
|
- s = text.to_s.gsub(/\s+/, " ")
|
|
|
|
|
- if s =~ /\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
|
|
|
|
|
- d = extract_date_like($2)
|
|
|
|
|
- return d unless d.empty?
|
|
|
|
|
- end
|
|
|
|
|
- if s =~ /clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
|
|
|
|
|
- d = extract_date_like($2)
|
|
|
|
|
- return d unless d.empty?
|
|
|
|
|
|
|
# Fetch the listing page; every advertised application is on this one page as an <h2>
# heading followed by sibling elements (label <p>, <ul>, document <p>).
html = Http.get(URL)
doc = Nokogiri::HTML(html)
items = []

# Collapse any mix of non-breaking and ordinary whitespace into single spaces.
squish = ->(text) { text.to_s.gsub(/[\u00a0\s]+/, " ").strip }

# Walk h2 elements; collect their following element siblings until the next h2
doc.css("h2").each do |h2|
  sibling_nodes = []
  sib = h2.next_sibling
  while sib
    break if sib.element? && sib.name == "h2"
    sibling_nodes << sib if sib.element?
    sib = sib.next_sibling
  end
  next if sibling_nodes.empty?

  # <p> containing the APPLICANT/PROPOSAL/LOCATION/CLOSES labels
  label_p = sibling_nodes.find { |n| n.name == "p" && n.text =~ /APPLICANT|PROPOSAL|LOCATION|CLOSES/i }
  kv = parse_strong_labels(label_p)

  # <ul> containing the application number (and a second copy of the closing date)
  ul_node = sibling_nodes.find { |n| n.name == "ul" }
  ul_text = squish.call(ul_node&.text)

  # Document link: prefer a link that actually points at a PDF so a stray link in the
  # label paragraph is not mistaken for the proposal document; otherwise first link wins.
  links = sibling_nodes.select { |n| n.name == "p" }.flat_map { |n| n.css("a[href]").to_a }
  pdf_link = links.find { |a| a["href"].to_s.match?(/\.pdf(?:[?#]|\z)/i) } || links.first

  # --- Reference: "PA NO: 2025065" from the ul, or "PA2025065" from the file name ---
  ref = nil
  if (m = ul_text.match(/PA\s*(?:NO:?)?\s*(\d{5,})/i))
    ref = "PA #{m[1]}"
  end
  if ref.nil? && pdf_link
    ref = pdf_link["href"].to_s.match(/PA(\d{5,})/i)&.then { |mm| "PA #{mm[1]}" }
  end
  # Headings without a recognisable reference (navigation, page titles) are not applications
  next unless ref

  # --- Address from LOCATION label, fallback to h2 text ---
  address = kv["LOCATION"] || kv["ADDRESS"] || squish.call(h2.text)
  next if address.empty?

  # --- Other fields ---
  applicant = kv["APPLICANT"].to_s
  description = kv["PROPOSAL"].to_s
  description = "Development Application" if description.empty?

  # Strip time prefix: "5pm on 16 April 2026" → "16 April 2026"
  closes_raw = kv["CLOSES"].to_s.sub(/\A.*?\bon\s+/i, "").strip
  on_notice_to = Util.parse_aus_date(closes_raw)
  # Fall back to the list item ("Closes 16 April 2026") when the label is missing or
  # did not parse; an unparseable label is only replaced by a list date that does parse.
  if on_notice_to.nil? && (m = ul_text.match(/Closes?\s+(\d{1,2}\s+[A-Za-z]+\s+\d{4})/i))
    ul_date = Util.parse_aus_date(m[1])
    if ul_date || closes_raw.empty?
      closes_raw = m[1]
      on_notice_to = ul_date
    end
  end

  document_url = pdf_link ? abs_url(URL, pdf_link["href"].to_s) : ""

  items << {
    council_reference: ref,
    address: address,
    description: description,
    applicant: applicant,
    on_notice_to: on_notice_to,
    on_notice_to_raw: closes_raw,
    document_url: document_url
  }
end

puts "Found #{items.length} item(s) for #{TABLE}"

items.each do |r|
  # Attachments are only fetched when explicitly enabled; nil means "no local copy"
  local_url = DOWNLOAD_ATTACHMENTS ? download_pdf(r[:document_url], r[:council_reference]) : nil

  upsert_and_enrich!(
    table: TABLE,
    row: {
      council_reference: r[:council_reference],
      address: r[:address],
      description: r[:description],
      applicant: r[:applicant],
      on_notice_to: r[:on_notice_to],
      on_notice_to_raw: r[:on_notice_to_raw],
      owner: ""
    },
    extras: {
      document_url: r[:document_url],
      local_document_url: local_url
    }
  )
end

puts "Done #{TABLE}. Saved #{items.length} item(s)."
|