|
@@ -1,186 +1,102 @@
|
|
|
-# Northern Midlands Council — Advertised / Planning Applications (site page)
|
|
|
|
|
|
|
+# Northern Midlands Council — Advertised Planning Applications
|
|
|
|
|
+#
|
|
|
|
|
+# Source: https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2
|
|
|
|
|
+#
|
|
|
|
|
+# Page structure:
|
|
|
|
|
+# <h2>Closing 17 April 2026</h2>
|
|
|
|
|
+# <p>
|
|
|
|
|
+# <a href="...pdf"><strong>PLN-26-0030 - 13 Murray Street, Evandale:</strong></a>
|
|
|
|
|
+# (CT 21/1332) - Subdivision (2 Lot)
|
|
|
|
|
+# </p>
|
|
|
|
|
|
|
|
require "nokogiri"
|
|
require "nokogiri"
|
|
|
require "uri"
|
|
require "uri"
|
|
|
-require "cgi"
|
|
|
|
|
|
|
|
|
|
require_relative "../lib/scraper_helpers"
|
|
require_relative "../lib/scraper_helpers"
|
|
|
require_relative "../lib/util"
|
|
require_relative "../lib/util"
|
|
|
require_relative "../lib/log"
|
|
require_relative "../lib/log"
|
|
|
-TABLE = ENV.fetch("TABLE_NAME") # run_all.sh -> da_northernmidlands
|
|
|
|
|
|
|
+
|
|
|
|
|
+TABLE = ENV.fetch("TABLE_NAME")
|
|
|
URL = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
|
|
URL = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
|
|
|
|
|
|
|
|
DB.ensure_table!(TABLE)
|
|
DB.ensure_table!(TABLE)
|
|
|
|
|
|
|
|
-# "DA 2025/00123", "DA2025/00123", "Application No. DA 2025/123"
|
|
|
|
|
-REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i
|
|
|
|
|
-REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i
|
|
|
|
|
-
|
|
|
|
|
-def extract_ref(str)
|
|
|
|
|
- s = CGI.unescape(str.to_s)
|
|
|
|
|
- if (m = s.match(REF_RX1))
|
|
|
|
|
- return "DA #{m[1]} / #{m[2]}"
|
|
|
|
|
- end
|
|
|
|
|
- if (m = s.match(REF_RX2))
|
|
|
|
|
- return "DA #{m[1]} / #{m[2]}"
|
|
|
|
|
- end
|
|
|
|
|
- nil
|
|
|
|
|
-end
|
|
|
|
|
-
|
|
|
|
|
-DATE_RX = /
|
|
|
|
|
- (\b\d{1,2}\/\d{1,2}\/\d{2,4}\b|
|
|
|
|
|
- \b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b|
|
|
|
|
|
- \b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)
|
|
|
|
|
-/x
|
|
|
|
|
-
|
|
|
|
|
-def extract_on_notice_raw(text)
|
|
|
|
|
- s = text.to_s.gsub(/\s+/, " ")
|
|
|
|
|
- if (m = s.match(/\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i))
|
|
|
|
|
- if (d = m[2].match(DATE_RX))
|
|
|
|
|
- return d[1]
|
|
|
|
|
- end
|
|
|
|
|
- end
|
|
|
|
|
- if (m = s.match(/clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i))
|
|
|
|
|
- if (d = m[2].match(DATE_RX))
|
|
|
|
|
- return d[1]
|
|
|
|
|
- end
|
|
|
|
|
- end
|
|
|
|
|
- if (d = s.match(DATE_RX))
|
|
|
|
|
- return d[1]
|
|
|
|
|
- end
|
|
|
|
|
- ""
|
|
|
|
|
-end
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-def nearest_context_text(a)
|
|
|
|
|
- host = a.ancestors("li, p, div, tr, article").first || a.parent
|
|
|
|
|
- host ? host.text.to_s.strip.gsub(/\s+/, " ") : ""
|
|
|
|
|
-end
|
|
|
|
|
|
|
+REF_RX = /\bPLN-\d{2}-\d{4}\b/i
|
|
|
|
|
|
|
|
-def parse_items(doc, base_url)
|
|
|
|
|
- rows = []
|
|
|
|
|
|
|
+html = Http.get(URL)
|
|
|
|
|
+doc = Nokogiri::HTML(html)
|
|
|
|
|
|
|
|
- # 1) Obvious list items or rows with PDFs or application keywords
|
|
|
|
|
- anchors = doc.css("a").select { |a|
|
|
|
|
|
- href = a["href"].to_s
|
|
|
|
|
- a.text.to_s.strip.match?(/application|permit|planning|advertis/i) || href.downcase.end_with?(".pdf")
|
|
|
|
|
- }
|
|
|
|
|
|
|
+items = []
|
|
|
|
|
+closing_date = nil
|
|
|
|
|
+closing_date_raw = ""
|
|
|
|
|
|
|
|
- anchors.each do |a|
|
|
|
|
|
- href = a["href"].to_s
|
|
|
|
|
- link_text = a.text.to_s.strip
|
|
|
|
|
- document_url = abs_url(base_url, href)
|
|
|
|
|
- ctx = nearest_context_text(a)
|
|
|
|
|
-
|
|
|
|
|
- # Title to keep, if present
|
|
|
|
|
- title_reference = link_text.empty? ? ctx[0,200] : link_text
|
|
|
|
|
-
|
|
|
|
|
- text_for_parse = [link_text, ctx].reject(&:empty?).uniq.join(" — ")
|
|
|
|
|
-
|
|
|
|
|
- # Address: prefer the link text, else the surrounding text slice
|
|
|
|
|
- address = if link_text.length >= 6
|
|
|
|
|
- link_text
|
|
|
|
|
- else
|
|
|
|
|
- ctx[0, 140]
|
|
|
|
|
|
|
+# Walk nodes in document order so h2 headings set the closing date for
|
|
|
|
|
+# the <p> entries that follow them.
|
|
|
|
|
+doc.css("h2, p").each do |node|
|
|
|
|
|
+ if node.name == "h2"
|
|
|
|
|
+ text = node.text.gsub(/\u00a0|\s+/, " ").strip
|
|
|
|
|
+ if (m = text.match(/Closing\s+(.+)/i))
|
|
|
|
|
+ closing_date_raw = m[1].strip
|
|
|
|
|
+ closing_date = Util.parse_aus_date(closing_date_raw)
|
|
|
end
|
|
end
|
|
|
|
|
+ next
|
|
|
|
|
+ end
|
|
|
|
|
|
|
|
- # Reference from text or file name
|
|
|
|
|
- ref = extract_ref(text_for_parse) || extract_ref(File.basename(document_url))
|
|
|
|
|
|
|
+ # <p> — look for a PLN reference inside a link
|
|
|
|
|
+ link = node.at_css("a[href]")
|
|
|
|
|
+ next unless link
|
|
|
|
|
|
|
|
- # On-notice
|
|
|
|
|
- on_raw = extract_on_notice_raw(text_for_parse)
|
|
|
|
|
- on_dt = Util.parse_aus_date(on_raw)
|
|
|
|
|
|
|
+ strong = node.at_css("strong")
|
|
|
|
|
+ label = (strong || link).text.gsub(/\u00a0|\s+/, " ").strip
|
|
|
|
|
+ # e.g. "PLN-26-0030 - 13 Murray Street, Evandale:"
|
|
|
|
|
|
|
|
- # Description
|
|
|
|
|
- description = if text_for_parse =~ /proposal\s*[:\-]\s*([^—\-]+)\b/i
|
|
|
|
|
- $1.strip
|
|
|
|
|
- else
|
|
|
|
|
- "Development Application"
|
|
|
|
|
- end
|
|
|
|
|
|
|
+ ref = label.match(REF_RX)&.[](0)
|
|
|
|
|
+ next unless ref
|
|
|
|
|
|
|
|
- next if ref.nil? || address.to_s.strip.empty?
|
|
|
|
|
|
|
+ # Address: everything after "PLN-XX-XXXX - " with trailing colon stripped
|
|
|
|
|
+ address = label.sub(/\APLN-\d{2}-\d{4}\s*-\s*/i, "").sub(/:?\s*\z/, "").strip
|
|
|
|
|
+ next if address.empty?
|
|
|
|
|
|
|
|
- rows << {
|
|
|
|
|
- council_reference: ref,
|
|
|
|
|
- address: address.to_s.strip,
|
|
|
|
|
- description: description,
|
|
|
|
|
- date_received: on_dt,
|
|
|
|
|
- date_received_raw: on_raw,
|
|
|
|
|
- document_url: document_url,
|
|
|
|
|
- title_reference: title_reference
|
|
|
|
|
- }
|
|
|
|
|
- end
|
|
|
|
|
|
|
+ # Remainder of the <p> text (outside the link/strong) gives description + CT; normalise whitespace first so the already-normalised label is actually found and removed
|
|
|
|
|
+ remainder = node.text.gsub(/\u00a0|\s+/, " ").strip.sub(label, "").strip
|
|
|
|
|
+ # e.g. "(CT 189429/1) - Multiple Dwelling (1 existing 1 new manager's residence)"
|
|
|
|
|
|
|
|
- # 2) If the page uses a two-column details table, pick that up too
|
|
|
|
|
- doc.css("table").each do |t|
|
|
|
|
|
- heads = t.css("th").map { |th| th.text.strip.downcase }
|
|
|
|
|
- next unless heads.any? { |h| h.match?(/application|reference|proposal|address|notice|closing/) }
|
|
|
|
|
-
|
|
|
|
|
- t.css("tr").each do |tr|
|
|
|
|
|
- cells = tr.css("td")
|
|
|
|
|
- next unless cells.length >= 2
|
|
|
|
|
- row_text = tr.text.to_s.strip.gsub(/\s+/, " ")
|
|
|
|
|
- ref = extract_ref(row_text)
|
|
|
|
|
- addr = row_text[/address[:\s]+(.+?)(?:\s{2,}|$)/i, 1] || row_text[0, 140]
|
|
|
|
|
- on_raw = extract_on_notice_raw(row_text)
|
|
|
|
|
- on_dt = Util.parse_aus_date(on_raw)
|
|
|
|
|
- next if ref.nil? || addr.to_s.strip.empty?
|
|
|
|
|
- rows << {
|
|
|
|
|
- council_reference: ref,
|
|
|
|
|
- address: addr.to_s.strip,
|
|
|
|
|
- description: "Development Application",
|
|
|
|
|
- date_received: on_dt,
|
|
|
|
|
- date_received_raw: on_raw,
|
|
|
|
|
- document_url: "",
|
|
|
|
|
- title_reference: row_text[0,200]
|
|
|
|
|
- }
|
|
|
|
|
- end
|
|
|
|
|
- end
|
|
|
|
|
-
|
|
|
|
|
- rows.uniq { |r| [r[:council_reference], r[:address]] }
|
|
|
|
|
-end
|
|
|
|
|
|
|
+ title_reference = remainder.match(/CT\s+([\d\/]+)/i)&.[](1).to_s
|
|
|
|
|
+ description = remainder.sub(/\A\s*\(CT[^)]*\)\s*-?\s*/i, "").strip
|
|
|
|
|
+ description = "Development Application" if description.empty?
|
|
|
|
|
|
|
|
-if URL.empty?
|
|
|
|
|
- Log.warn "scraper", "NORTHERN_MIDLANDS_URL is not set. Example:\n ONLY=northernmidlands NORTHERN_MIDLANDS_URL='https://.../advertised-applications' docker compose run --rm scraper /app/run_all.sh"
|
|
|
|
|
- exit 0
|
|
|
|
|
-end
|
|
|
|
|
|
|
+ document_url = abs_url(URL, link["href"].to_s)
|
|
|
|
|
|
|
|
-begin
|
|
|
|
|
- html = if URL.include?("/eservice/")
|
|
|
|
|
- # Some councils use ePathway, which needs a cookie-warmed session
|
|
|
|
|
- Http.dorset_session_get(URL)
|
|
|
|
|
- else
|
|
|
|
|
- Http.get(URL)
|
|
|
|
|
- end
|
|
|
|
|
-rescue StandardError => e
|
|
|
|
|
- Log.warn "scraper", "Failed to fetch #{URL}: #{e.class} #{e.message}"
|
|
|
|
|
- exit 1
|
|
|
|
|
|
|
+ items << {
|
|
|
|
|
+ council_reference: ref,
|
|
|
|
|
+ address: address,
|
|
|
|
|
+ description: description,
|
|
|
|
|
+ on_notice_to: closing_date,
|
|
|
|
|
+ on_notice_to_raw: closing_date_raw,
|
|
|
|
|
+ title_reference: title_reference,
|
|
|
|
|
+ document_url: document_url
|
|
|
|
|
+ }
|
|
|
end
|
|
end
|
|
|
|
|
|
|
|
-doc = Nokogiri::HTML(html)
|
|
|
|
|
-items = parse_items(doc, URL)
|
|
|
|
|
-
|
|
|
|
|
puts "Found #{items.length} item(s) for #{TABLE}"
|
|
puts "Found #{items.length} item(s) for #{TABLE}"
|
|
|
|
|
|
|
|
items.each do |r|
|
|
items.each do |r|
|
|
|
upsert_and_enrich!(
|
|
upsert_and_enrich!(
|
|
|
table: TABLE,
|
|
table: TABLE,
|
|
|
row: {
|
|
row: {
|
|
|
- description: r[:description],
|
|
|
|
|
- date_received: r[:date_received],
|
|
|
|
|
- date_received_raw: r[:date_received_raw],
|
|
|
|
|
- address: r[:address],
|
|
|
|
|
council_reference: r[:council_reference],
|
|
council_reference: r[:council_reference],
|
|
|
- applicant: "",
|
|
|
|
|
- owner: ""
|
|
|
|
|
|
|
+ address: r[:address],
|
|
|
|
|
+ description: r[:description],
|
|
|
|
|
+ on_notice_to: r[:on_notice_to],
|
|
|
|
|
+ on_notice_to_raw: r[:on_notice_to_raw],
|
|
|
|
|
+ title_reference: r[:title_reference],
|
|
|
|
|
+ applicant: "",
|
|
|
|
|
+ owner: ""
|
|
|
},
|
|
},
|
|
|
extras: {
|
|
extras: {
|
|
|
- document_url: r[:document_url],
|
|
|
|
|
- on_notice_to: r[:date_received],
|
|
|
|
|
- on_notice_to_raw: r[:date_received_raw],
|
|
|
|
|
- title_reference: r[:title_reference]
|
|
|
|
|
|
|
+ document_url: r[:document_url]
|
|
|
}
|
|
}
|
|
|
)
|
|
)
|
|
|
end
|
|
end
|
|
|
|
|
|
|
|
-puts "Done #{TABLE}."
|
|
|
|
|
|
|
+puts "Done #{TABLE}. Saved #{items.length} item(s)."
|