2 luni în urmă · 1035f8ddd9
--- a/scrapers/northernmidlands.rb
+++ b/scrapers/northernmidlands.rb
@@ -1,186 +1,102 @@
 
															-# Northern Midlands Council — Advertised / Planning Applications (site page)
														
 
															+# Northern Midlands Council — Advertised Planning Applications
														
 
															+#
														
 
															+# Source: https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2
														
 
															+#
														
 
															+# Page structure:
														
 
															+#   <h2>Closing 17 April 2026</h2>
														
 
															+#   <p>
														
 
															+#     <a href="...pdf"><strong>PLN-26-0030 - 13 Murray Street, Evandale:</strong></a>
														
 
															+#     (CT 21/1332) - Subdivision (2 Lot)
														
 
															+#   </p>
														
 
															 require "nokogiri"
														
 
															 require "uri"
														
 
															-require "cgi"
														
 
															 require_relative "../lib/scraper_helpers"
														
 
															 require_relative "../lib/util"
														
 
															 require_relative "../lib/log"
														
 
															-TABLE = ENV.fetch("TABLE_NAME")                    # run_all.sh -> da_northernmidlands
														
 
															+
														
 
															+TABLE = ENV.fetch("TABLE_NAME")
														
 
															 URL   = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
														
 
															 DB.ensure_table!(TABLE)
														
 
															-# "DA 2025/00123", "DA2025/00123", "Application No. DA 2025/123"
														
 
															-REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i
														
 
															-REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i
														
 
															-
														
 
															-def extract_ref(str)
														
 
															-  s = CGI.unescape(str.to_s)
														
 
															-  if (m = s.match(REF_RX1))
														
 
															-    return "DA #{m[1]} / #{m[2]}"
														
 
															-  end
														
 
															-  if (m = s.match(REF_RX2))
														
 
															-    return "DA #{m[1]} / #{m[2]}"
														
 
															-  end
														
 
															-  nil
														
 
															-end
														
 
															-
														
 
															-DATE_RX = /
														
 
															-  (\b\d{1,2}\/\d{1,2}\/\d{2,4}\b|
														
 
															-   \b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b|
														
 
															-   \b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)
														
 
															-/x
														
 
															-
														
 
															-def extract_on_notice_raw(text)
														
 
															-  s = text.to_s.gsub(/\s+/, " ")
														
 
															-  if (m = s.match(/\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i))
														
 
															-    if (d = m[2].match(DATE_RX))
														
 
															-      return d[1]
														
 
															-    end
														
 
															-  end
														
 
															-  if (m = s.match(/clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i))
														
 
															-    if (d = m[2].match(DATE_RX))
														
 
															-      return d[1]
														
 
															-    end
														
 
															-  end
														
 
															-  if (d = s.match(DATE_RX))
														
 
															-    return d[1]
														
 
															-  end
														
 
															-  ""
														
 
															-end
														
 
															-
														
 
															-
														
 
															-def nearest_context_text(a)
														
 
															-  host = a.ancestors("li, p, div, tr, article").first || a.parent
														
 
															-  host ? host.text.to_s.strip.gsub(/\s+/, " ") : ""
														
 
															-end
														
 
															+REF_RX = /\bPLN-\d{2}-\d{4}\b/i
														
 
															-def parse_items(doc, base_url)
														
 
															-  rows = []
														
 
															+html = Http.get(URL)
														
 
															+doc  = Nokogiri::HTML(html)
														
 
															-  # 1) Obvious list items or rows with PDFs or application keywords
														
 
															-  anchors = doc.css("a").select { |a|
														
 
															-    href = a["href"].to_s
														
 
															-    a.text.to_s.strip.match?(/application|permit|planning|advertis/i) || href.downcase.end_with?(".pdf")
														
 
															-  }
														
 
															+items = []
														
 
															+closing_date     = nil
														
 
															+closing_date_raw = ""
														
 
															-  anchors.each do |a|
														
 
															-    href = a["href"].to_s
														
 
															-    link_text = a.text.to_s.strip
														
 
															-    document_url = abs_url(base_url, href)
														
 
															-    ctx = nearest_context_text(a)
														
 
															-
														
 
															-    # Title to keep, if present
														
 
															-    title_reference = link_text.empty? ? ctx[0,200] : link_text
														
 
															-
														
 
															-    text_for_parse = [link_text, ctx].reject(&:empty?).uniq.join(" — ")
														
 
															-
														
 
															-    # Address: prefer the link text, else the surrounding text slice
														
 
															-    address = if link_text.length >= 6
														
 
															-      link_text
														
 
															-    else
														
 
															-      ctx[0, 140]
														
 
															+# Walk nodes in document order so h2 headings set the closing date for
														
 
															+# the <p> entries that follow them.
														
 
															+doc.css("h2, p").each do |node|
														
 
															+  if node.name == "h2"
														
 
															+    text = node.text.gsub(/\u00a0|\s+/, " ").strip
														
 
															+    if (m = text.match(/Closing\s+(.+)/i))
														
 
															+      closing_date_raw = m[1].strip
														
 
															+      closing_date     = Util.parse_aus_date(closing_date_raw)
														
 
															     end
														
 
															+    next
														
 
															+  end
														
 
															-    # Reference from text or file name
														
 
															-    ref = extract_ref(text_for_parse) || extract_ref(File.basename(document_url))
														
 
															+  # <p> — look for a PLN reference inside a link
														
 
															+  link = node.at_css("a[href]")
														
 
															+  next unless link
														
 
															-    # On-notice
														
 
															-    on_raw = extract_on_notice_raw(text_for_parse)
														
 
															-    on_dt  = Util.parse_aus_date(on_raw)
														
 
															+  strong = node.at_css("strong")
														
 
															+  label  = (strong || link).text.gsub(/\u00a0|\s+/, " ").strip
														
 
															+  # e.g. "PLN-26-0030 - 13 Murray Street, Evandale:"
														
 
															-    # Description
														
 
															-    description = if text_for_parse =~ /proposal\s*[:\-]\s*([^—\-]+)\b/i
														
 
															-      $1.strip
														
 
															-    else
														
 
															-      "Development Application"
														
 
															-    end
														
 
															+  ref = label.match(REF_RX)&.[](0)
														
 
															+  next unless ref
														
 
															-    next if ref.nil? || address.to_s.strip.empty?
														
 
															+  # Address: everything after "PLN-XX-XXXX - " with trailing colon stripped
														
 
															+  address = label.sub(/\APLN-\d{2}-\d{4}\s*-\s*/i, "").sub(/:?\s*\z/, "").strip
														
 
															+  next if address.empty?
														
 
															-    rows << {
														
 
															-      council_reference: ref,
														
 
															-      address: address.to_s.strip,
														
 
															-      description: description,
														
 
															-      date_received: on_dt,
														
 
															-      date_received_raw: on_raw,
														
 
															-      document_url: document_url,
														
 
															-      title_reference: title_reference
														
 
															-    }
														
 
															-  end
														
 
															+  # Remainder of the <p> text (outside the link/strong) gives description + CT; normalise whitespace BEFORE removing the (already normalised) label so the substring actually matches
														
 
															+  remainder = node.text.gsub(/\u00a0|\s+/, " ").sub(label, "").squeeze(" ").strip
														
 
															+  # e.g. "(CT 189429/1) - Multiple Dwelling (1 existing 1 new manager's residence)"
														
 
															-  # 2) If the page uses a two-column details table, pick that up too
														
 
															-  doc.css("table").each do |t|
														
 
															-    heads = t.css("th").map { |th| th.text.strip.downcase }
														
 
															-    next unless heads.any? { |h| h.match?(/application|reference|proposal|address|notice|closing/) }
														
 
															-
														
 
															-    t.css("tr").each do |tr|
														
 
															-      cells = tr.css("td")
														
 
															-      next unless cells.length >= 2
														
 
															-      row_text = tr.text.to_s.strip.gsub(/\s+/, " ")
														
 
															-      ref = extract_ref(row_text)
														
 
															-      addr = row_text[/address[:\s]+(.+?)(?:\s{2,}|$)/i, 1] || row_text[0, 140]
														
 
															-      on_raw = extract_on_notice_raw(row_text)
														
 
															-      on_dt  = Util.parse_aus_date(on_raw)
														
 
															-      next if ref.nil? || addr.to_s.strip.empty?
														
 
															-      rows << {
														
 
															-        council_reference: ref,
														
 
															-        address: addr.to_s.strip,
														
 
															-        description: "Development Application",
														
 
															-        date_received: on_dt,
														
 
															-        date_received_raw: on_raw,
														
 
															-        document_url: "",
														
 
															-        title_reference: row_text[0,200]
														
 
															-      }
														
 
															-    end
														
 
															-  end
														
 
															-
														
 
															-  rows.uniq { |r| [r[:council_reference], r[:address]] }
														
 
															-end
														
 
															+  title_reference = remainder.match(/CT\s+([\d\/]+)/i)&.[](1).to_s
														
 
															+  description     = remainder.sub(/\A\s*\(CT[^)]*\)\s*-?\s*/i, "").strip
														
 
															+  description     = "Development Application" if description.empty?
														
 
															-if URL.empty?
														
 
															-  Log.warn "scraper", "NORTHERN_MIDLANDS_URL is not set. Example:\n  ONLY=northernmidlands NORTHERN_MIDLANDS_URL='https://.../advertised-applications' docker compose run --rm scraper /app/run_all.sh"
														
 
															-  exit 0
														
 
															-end
														
 
															+  document_url = abs_url(URL, link["href"].to_s)
														
 
															-begin
														
 
															-  html = if URL.include?("/eservice/")
														
 
															-    # Some councils use ePathway, which needs a cookie-warmed session
														
 
															-    Http.dorset_session_get(URL)
														
 
															-  else
														
 
															-    Http.get(URL)
														
 
															-  end
														
 
															-rescue StandardError => e
														
 
															-  Log.warn "scraper", "Failed to fetch #{URL}: #{e.class} #{e.message}"
														
 
															-  exit 1
														
 
															+  items << {
														
 
															+    council_reference: ref,
														
 
															+    address:           address,
														
 
															+    description:       description,
														
 
															+    on_notice_to:      closing_date,
														
 
															+    on_notice_to_raw:  closing_date_raw,
														
 
															+    title_reference:   title_reference,
														
 
															+    document_url:      document_url
														
 
															+  }
														
 
															 end
														
 
															-doc = Nokogiri::HTML(html)
														
 
															-items = parse_items(doc, URL)
														
 
															-
														
 
															 puts "Found #{items.length} item(s) for #{TABLE}"
														
 
															 items.each do |r|
														
 
															   upsert_and_enrich!(
														
 
															     table: TABLE,
														
 
															     row: {
														
 
															-      description: r[:description],
														
 
															-      date_received: r[:date_received],
														
 
															-      date_received_raw: r[:date_received_raw],
														
 
															-      address: r[:address],
														
 
															       council_reference: r[:council_reference],
														
 
															-      applicant: "",
														
 
															-      owner: ""
														
 
															+      address:           r[:address],
														
 
															+      description:       r[:description],
														
 
															+      on_notice_to:      r[:on_notice_to],
														
 
															+      on_notice_to_raw:  r[:on_notice_to_raw],
														
 
															+      title_reference:   r[:title_reference],
														
 
															+      applicant:         "",
														
 
															+      owner:             ""
														
 
															     },
														
 
															     extras: {
														
 
															-      document_url:     r[:document_url],
														
 
															-      on_notice_to:     r[:date_received],
														
 
															-      on_notice_to_raw: r[:date_received_raw],
														
 
															-      title_reference:  r[:title_reference]
														
 
															+      document_url: r[:document_url]
														
 
															     }
														
 
															   )
														
 
															 end
														
 
															-puts "Done #{TABLE}."
														
 
															+puts "Done #{TABLE}. Saved #{items.length} item(s)."