2 månader sedan · 1035f8ddd9
--- a/scrapers/northernmidlands.rb
+++ b/scrapers/northernmidlands.rb
@@ -1,186 +1,102 @@
 
				-# Northern Midlands Council — Advertised / Planning Applications (site page)
			
 
				+# Northern Midlands Council — Advertised Planning Applications
			
 
				+#
			
 
				+# Source: https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2
			
 
				+#
			
 
				+# Page structure:
			
 
				+#   <h2>Closing 17 April 2026</h2>
			
 
				+#   <p>
			
 
				+#     <a href="...pdf"><strong>PLN-26-0030 - 13 Murray Street, Evandale:</strong></a>
			
 
				+#     (CT 21/1332) - Subdivision (2 Lot)
			
 
				+#   </p>
			
 
				 
			
 
				 require "nokogiri"
			
 
				 require "uri"
			
 
				-require "cgi"
			
 
				 
			
 
				 require_relative "../lib/scraper_helpers"
			
 
				 require_relative "../lib/util"
			
 
				 require_relative "../lib/log"
			
 
				-TABLE = ENV.fetch("TABLE_NAME")                    # run_all.sh -> da_northernmidlands
			
 
				+
			
 
				+TABLE = ENV.fetch("TABLE_NAME")
			
 
				 URL   = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
			
 
				 
			
 
				 DB.ensure_table!(TABLE)
			
 
				 
			
 
				-# "DA 2025/00123", "DA2025/00123", "Application No. DA 2025/123"
			
 
				-REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i
			
 
				-REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i
			
 
				-
			
 
				-def extract_ref(str)
			
 
				-  s = CGI.unescape(str.to_s)
			
 
				-  if (m = s.match(REF_RX1))
			
 
				-    return "DA #{m[1]} / #{m[2]}"
			
 
				-  end
			
 
				-  if (m = s.match(REF_RX2))
			
 
				-    return "DA #{m[1]} / #{m[2]}"
			
 
				-  end
			
 
				-  nil
			
 
				-end
			
 
				-
			
 
				-DATE_RX = /
			
 
				-  (\b\d{1,2}\/\d{1,2}\/\d{2,4}\b|
			
 
				-   \b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b|
			
 
				-   \b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)
			
 
				-/x
			
 
				-
			
 
				-def extract_on_notice_raw(text)
			
 
				-  s = text.to_s.gsub(/\s+/, " ")
			
 
				-  if (m = s.match(/\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i))
			
 
				-    if (d = m[2].match(DATE_RX))
			
 
				-      return d[1]
			
 
				-    end
			
 
				-  end
			
 
				-  if (m = s.match(/clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i))
			
 
				-    if (d = m[2].match(DATE_RX))
			
 
				-      return d[1]
			
 
				-    end
			
 
				-  end
			
 
				-  if (d = s.match(DATE_RX))
			
 
				-    return d[1]
			
 
				-  end
			
 
				-  ""
			
 
				-end
			
 
				-
			
 
				-
			
 
				-def nearest_context_text(a)
			
 
				-  host = a.ancestors("li, p, div, tr, article").first || a.parent
			
 
				-  host ? host.text.to_s.strip.gsub(/\s+/, " ") : ""
			
 
				-end
			
 
				+REF_RX = /\bPLN-\d{2}-\d{4}\b/i
			
 
				 
			
 
				-def parse_items(doc, base_url)
			
 
				-  rows = []
			
 
				+html = Http.get(URL)
			
 
				+doc  = Nokogiri::HTML(html)
			
 
				 
			
 
				-  # 1) Obvious list items or rows with PDFs or application keywords
			
 
				-  anchors = doc.css("a").select { |a|
			
 
				-    href = a["href"].to_s
			
 
				-    a.text.to_s.strip.match?(/application|permit|planning|advertis/i) || href.downcase.end_with?(".pdf")
			
 
				-  }
			
 
				+items = []
			
 
				+closing_date     = nil
			
 
				+closing_date_raw = ""
			
 
				 
			
 
				-  anchors.each do |a|
			
 
				-    href = a["href"].to_s
			
 
				-    link_text = a.text.to_s.strip
			
 
				-    document_url = abs_url(base_url, href)
			
 
				-    ctx = nearest_context_text(a)
			
 
				-
			
 
				-    # Title to keep, if present
			
 
				-    title_reference = link_text.empty? ? ctx[0,200] : link_text
			
 
				-
			
 
				-    text_for_parse = [link_text, ctx].reject(&:empty?).uniq.join(" — ")
			
 
				-
			
 
				-    # Address: prefer the link text, else the surrounding text slice
			
 
				-    address = if link_text.length >= 6
			
 
				-      link_text
			
 
				-    else
			
 
				-      ctx[0, 140]
			
 
				+# Walk nodes in document order so h2 headings set the closing date for
			
 
				+# the <p> entries that follow them.
			
 
				+doc.css("h2, p").each do |node|
			
 
				+  if node.name == "h2"
			
 
				+    text = node.text.gsub(/\u00a0|\s+/, " ").strip
			
 
				+    if (m = text.match(/Closing\s+(.+)/i))
			
 
				+      closing_date_raw = m[1].strip
			
 
				+      closing_date     = Util.parse_aus_date(closing_date_raw)
			
 
				     end
			
 
				+    next
			
 
				+  end
			
 
				 
			
 
				-    # Reference from text or file name
			
 
				-    ref = extract_ref(text_for_parse) || extract_ref(File.basename(document_url))
			
 
				+  # <p> — look for a PLN reference inside a link
			
 
				+  link = node.at_css("a[href]")
			
 
				+  next unless link
			
 
				 
			
 
				-    # On-notice
			
 
				-    on_raw = extract_on_notice_raw(text_for_parse)
			
 
				-    on_dt  = Util.parse_aus_date(on_raw)
			
 
				+  strong = node.at_css("strong")
			
 
				+  label  = (strong || link).text.gsub(/\u00a0|\s+/, " ").strip
			
 
				+  # e.g. "PLN-26-0030 - 13 Murray Street, Evandale:"
			
 
				 
			
 
				-    # Description
			
 
				-    description = if text_for_parse =~ /proposal\s*[:\-]\s*([^—\-]+)\b/i
			
 
				-      $1.strip
			
 
				-    else
			
 
				-      "Development Application"
			
 
				-    end
			
 
				+  ref = label.match(REF_RX)&.[](0)
			
 
				+  next unless ref
			
 
				 
			
 
				-    next if ref.nil? || address.to_s.strip.empty?
			
 
				+  # Address: everything after "PLN-XX-XXXX - " with trailing colon stripped
			
 
				+  address = label.sub(/\APLN-\d{2}-\d{4}\s*[-\u2013\u2014]\s*/i, "").sub(/:?\s*\z/, "").strip  # separator may be a hyphen, en dash or em dash
			
 
				+  next if address.empty?
			
 
				 
			
 
				-    rows << {
			
 
				-      council_reference: ref,
			
 
				-      address: address.to_s.strip,
			
 
				-      description: description,
			
 
				-      date_received: on_dt,
			
 
				-      date_received_raw: on_raw,
			
 
				-      document_url: document_url,
			
 
				-      title_reference: title_reference
			
 
				-    }
			
 
				-  end
			
 
				+  # Remainder of the <p> text (outside the link/strong) gives description + CT
			
 
				+  remainder = node.text.gsub(/\u00a0|\s+/, " ").sub(label, "").gsub(/\s+/, " ").strip  # normalise whitespace BEFORE removing the (already normalised) label, otherwise the literal match can miss
			
 
				+  # e.g. "(CT 189429/1) - Multiple Dwelling (1 existing 1 new manager's residence)"
			
 
				 
			
 
				-  # 2) If the page uses a two-column details table, pick that up too
			
 
				-  doc.css("table").each do |t|
			
 
				-    heads = t.css("th").map { |th| th.text.strip.downcase }
			
 
				-    next unless heads.any? { |h| h.match?(/application|reference|proposal|address|notice|closing/) }
			
 
				-
			
 
				-    t.css("tr").each do |tr|
			
 
				-      cells = tr.css("td")
			
 
				-      next unless cells.length >= 2
			
 
				-      row_text = tr.text.to_s.strip.gsub(/\s+/, " ")
			
 
				-      ref = extract_ref(row_text)
			
 
				-      addr = row_text[/address[:\s]+(.+?)(?:\s{2,}|$)/i, 1] || row_text[0, 140]
			
 
				-      on_raw = extract_on_notice_raw(row_text)
			
 
				-      on_dt  = Util.parse_aus_date(on_raw)
			
 
				-      next if ref.nil? || addr.to_s.strip.empty?
			
 
				-      rows << {
			
 
				-        council_reference: ref,
			
 
				-        address: addr.to_s.strip,
			
 
				-        description: "Development Application",
			
 
				-        date_received: on_dt,
			
 
				-        date_received_raw: on_raw,
			
 
				-        document_url: "",
			
 
				-        title_reference: row_text[0,200]
			
 
				-      }
			
 
				-    end
			
 
				-  end
			
 
				-
			
 
				-  rows.uniq { |r| [r[:council_reference], r[:address]] }
			
 
				-end
			
 
				+  title_reference = remainder.match(/CT\s+([\d\/]+)/i)&.[](1).to_s
			
 
				+  description     = remainder.sub(/\A\s*\(CT[^)]*\)\s*-?\s*/i, "").strip
			
 
				+  description     = "Development Application" if description.empty?
			
 
				 
			
 
				-if URL.empty?
			
 
				-  Log.warn "scraper", "NORTHERN_MIDLANDS_URL is not set. Example:\n  ONLY=northernmidlands NORTHERN_MIDLANDS_URL='https://.../advertised-applications' docker compose run --rm scraper /app/run_all.sh"
			
 
				-  exit 0
			
 
				-end
			
 
				+  document_url = abs_url(URL, link["href"].to_s)
			
 
				 
			
 
				-begin
			
 
				-  html = if URL.include?("/eservice/")
			
 
				-    # Some councils use ePathway, which needs a cookie-warmed session
			
 
				-    Http.dorset_session_get(URL)
			
 
				-  else
			
 
				-    Http.get(URL)
			
 
				-  end
			
 
				-rescue StandardError => e
			
 
				-  Log.warn "scraper", "Failed to fetch #{URL}: #{e.class} #{e.message}"
			
 
				-  exit 1
			
 
				+  items << {
			
 
				+    council_reference: ref,
			
 
				+    address:           address,
			
 
				+    description:       description,
			
 
				+    on_notice_to:      closing_date,
			
 
				+    on_notice_to_raw:  closing_date_raw,
			
 
				+    title_reference:   title_reference,
			
 
				+    document_url:      document_url
			
 
				+  }
			
 
				 end
			
 
				 
			
 
				-doc = Nokogiri::HTML(html)
			
 
				-items = parse_items(doc, URL)
			
 
				-
			
 
				 puts "Found #{items.length} item(s) for #{TABLE}"
			
 
				 
			
 
				 items.each do |r|
			
 
				   upsert_and_enrich!(
			
 
				     table: TABLE,
			
 
				     row: {
			
 
				-      description: r[:description],
			
 
				-      date_received: r[:date_received],
			
 
				-      date_received_raw: r[:date_received_raw],
			
 
				-      address: r[:address],
			
 
				       council_reference: r[:council_reference],
			
 
				-      applicant: "",
			
 
				-      owner: ""
			
 
				+      address:           r[:address],
			
 
				+      description:       r[:description],
			
 
				+      on_notice_to:      r[:on_notice_to],
			
 
				+      on_notice_to_raw:  r[:on_notice_to_raw],
			
 
				+      title_reference:   r[:title_reference],
			
 
				+      applicant:         "",
			
 
				+      owner:             ""
			
 
				     },
			
 
				     extras: {
			
 
				-      document_url:     r[:document_url],
			
 
				-      on_notice_to:     r[:date_received],
			
 
				-      on_notice_to_raw: r[:date_received_raw],
			
 
				-      title_reference:  r[:title_reference]
			
 
				+      document_url: r[:document_url]
			
 
				     }
			
 
				   )
			
 
				 end
			
 
				 
			
 
				-puts "Done #{TABLE}."
			
 
				+puts "Done #{TABLE}. Saved #{items.length} item(s)."