Bläddra i källkod

northern midlands

Benjamin Harris 2 månader sedan
förälder
incheckning
1035f8ddd9
1 ändrade filer med 64 tillägg och 148 borttagningar
  1. 64 148
      scrapers/northernmidlands.rb

+ 64 - 148
scrapers/northernmidlands.rb

@@ -1,186 +1,102 @@
-# Northern Midlands Council — Advertised / Planning Applications (site page)
+# Northern Midlands Council — Advertised Planning Applications
+#
+# Source: https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2
+#
+# Page structure:
+#   <h2>Closing 17 April 2026</h2>
+#   <p>
+#     <a href="...pdf"><strong>PLN-26-0030 - 13 Murray Street, Evandale:</strong></a>
+#     (CT 21/1332) - Subdivision (2 Lot)
+#   </p>
 
 require "nokogiri"
 require "uri"
-require "cgi"
 
 require_relative "../lib/scraper_helpers"
 require_relative "../lib/util"
 require_relative "../lib/log"
-TABLE = ENV.fetch("TABLE_NAME")                    # run_all.sh -> da_northernmidlands
+
+TABLE = ENV.fetch("TABLE_NAME")
 URL   = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
 
 DB.ensure_table!(TABLE)
 
-# "DA 2025/00123", "DA2025/00123", "Application No. DA 2025/123"
-REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i
-REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i
-
-def extract_ref(str)
-  s = CGI.unescape(str.to_s)
-  if (m = s.match(REF_RX1))
-    return "DA #{m[1]} / #{m[2]}"
-  end
-  if (m = s.match(REF_RX2))
-    return "DA #{m[1]} / #{m[2]}"
-  end
-  nil
-end
-
-DATE_RX = /
-  (\b\d{1,2}\/\d{1,2}\/\d{2,4}\b|
-   \b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b|
-   \b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)
-/x
-
-def extract_on_notice_raw(text)
-  s = text.to_s.gsub(/\s+/, " ")
-  if (m = s.match(/\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i))
-    if (d = m[2].match(DATE_RX))
-      return d[1]
-    end
-  end
-  if (m = s.match(/clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i))
-    if (d = m[2].match(DATE_RX))
-      return d[1]
-    end
-  end
-  if (d = s.match(DATE_RX))
-    return d[1]
-  end
-  ""
-end
-
-
-def nearest_context_text(a)
-  host = a.ancestors("li, p, div, tr, article").first || a.parent
-  host ? host.text.to_s.strip.gsub(/\s+/, " ") : ""
-end
+REF_RX = /\bPLN-\d{2}-\d{4}\b/i
 
-def parse_items(doc, base_url)
-  rows = []
+html = Http.get(URL)
+doc  = Nokogiri::HTML(html)
 
-  # 1) Obvious list items or rows with PDFs or application keywords
-  anchors = doc.css("a").select { |a|
-    href = a["href"].to_s
-    a.text.to_s.strip.match?(/application|permit|planning|advertis/i) || href.downcase.end_with?(".pdf")
-  }
+items = []
+closing_date     = nil
+closing_date_raw = ""
 
-  anchors.each do |a|
-    href = a["href"].to_s
-    link_text = a.text.to_s.strip
-    document_url = abs_url(base_url, href)
-    ctx = nearest_context_text(a)
-
-    # Title to keep, if present
-    title_reference = link_text.empty? ? ctx[0,200] : link_text
-
-    text_for_parse = [link_text, ctx].reject(&:empty?).uniq.join(" — ")
-
-    # Address: prefer the link text, else the surrounding text slice
-    address = if link_text.length >= 6
-      link_text
-    else
-      ctx[0, 140]
+# Walk h2/p nodes in document order: a "Closing <date>" h2 sets the closing date for
+# the <p> entries that follow it; an h2 without "Closing" leaves the previous date in effect.
+doc.css("h2, p").each do |node|
+  if node.name == "h2"
+    text = node.text.gsub(/\u00a0|\s+/, " ").strip
+    if (m = text.match(/Closing\s+(.+)/i))
+      closing_date_raw = m[1].strip
+      closing_date     = Util.parse_aus_date(closing_date_raw)
     end
+    next
+  end
 
-    # Reference from text or file name
-    ref = extract_ref(text_for_parse) || extract_ref(File.basename(document_url))
+  # <p> — must contain a link; the PLN reference is read from the paragraph's first <strong>, else the link text
+  link = node.at_css("a[href]")
+  next unless link
 
-    # On-notice
-    on_raw = extract_on_notice_raw(text_for_parse)
-    on_dt  = Util.parse_aus_date(on_raw)
+  strong = node.at_css("strong")
+  label  = (strong || link).text.gsub(/\u00a0|\s+/, " ").strip
+  # e.g. "PLN-26-0030 - 13 Murray Street, Evandale:"
 
-    # Description
-    description = if text_for_parse =~ /proposal\s*[:\-]\s*([^—\-]+)\b/i
-      $1.strip
-    else
-      "Development Application"
-    end
+  ref = label.match(REF_RX)&.[](0)
+  next unless ref
 
-    next if ref.nil? || address.to_s.strip.empty?
+  # Address: everything after "PLN-XX-XXXX - " with trailing colon stripped
+  address = label.sub(/\APLN-\d{2}-\d{4}\s*-\s*/i, "").sub(/:?\s*\z/, "").strip
+  next if address.empty?
 
-    rows << {
-      council_reference: ref,
-      address: address.to_s.strip,
-      description: description,
-      date_received: on_dt,
-      date_received_raw: on_raw,
-      document_url: document_url,
-      title_reference: title_reference
-    }
-  end
+  # Remainder of the <p> text once the label is removed gives CT + description (literal match: raw text must equal the normalised label)
+  remainder = node.text.sub(label, "").gsub(/\u00a0|\s+/, " ").strip
+  # e.g. "(CT 189429/1) - Multiple Dwelling (1 existing 1 new manager's residence)"
 
-  # 2) If the page uses a two-column details table, pick that up too
-  doc.css("table").each do |t|
-    heads = t.css("th").map { |th| th.text.strip.downcase }
-    next unless heads.any? { |h| h.match?(/application|reference|proposal|address|notice|closing/) }
-
-    t.css("tr").each do |tr|
-      cells = tr.css("td")
-      next unless cells.length >= 2
-      row_text = tr.text.to_s.strip.gsub(/\s+/, " ")
-      ref = extract_ref(row_text)
-      addr = row_text[/address[:\s]+(.+?)(?:\s{2,}|$)/i, 1] || row_text[0, 140]
-      on_raw = extract_on_notice_raw(row_text)
-      on_dt  = Util.parse_aus_date(on_raw)
-      next if ref.nil? || addr.to_s.strip.empty?
-      rows << {
-        council_reference: ref,
-        address: addr.to_s.strip,
-        description: "Development Application",
-        date_received: on_dt,
-        date_received_raw: on_raw,
-        document_url: "",
-        title_reference: row_text[0,200]
-      }
-    end
-  end
-
-  rows.uniq { |r| [r[:council_reference], r[:address]] }
-end
+  title_reference = remainder.match(/CT\s+([\d\/]+)/i)&.[](1).to_s
+  description     = remainder.sub(/\A\s*\(CT[^)]*\)\s*-?\s*/i, "").strip
+  description     = "Development Application" if description.empty?
 
-if URL.empty?
-  Log.warn "scraper", "NORTHERN_MIDLANDS_URL is not set. Example:\n  ONLY=northernmidlands NORTHERN_MIDLANDS_URL='https://.../advertised-applications' docker compose run --rm scraper /app/run_all.sh"
-  exit 0
-end
+  document_url = abs_url(URL, link["href"].to_s)
 
-begin
-  html = if URL.include?("/eservice/")
-    # Some councils use ePathway, which needs a cookie-warmed session
-    Http.dorset_session_get(URL)
-  else
-    Http.get(URL)
-  end
-rescue StandardError => e
-  Log.warn "scraper", "Failed to fetch #{URL}: #{e.class} #{e.message}"
-  exit 1
+  items << {
+    council_reference: ref,
+    address:           address,
+    description:       description,
+    on_notice_to:      closing_date,
+    on_notice_to_raw:  closing_date_raw,
+    title_reference:   title_reference,
+    document_url:      document_url
+  }
 end
 
-doc = Nokogiri::HTML(html)
-items = parse_items(doc, URL)
-
 puts "Found #{items.length} item(s) for #{TABLE}"
 
 items.each do |r|
   upsert_and_enrich!(
     table: TABLE,
     row: {
-      description: r[:description],
-      date_received: r[:date_received],
-      date_received_raw: r[:date_received_raw],
-      address: r[:address],
       council_reference: r[:council_reference],
-      applicant: "",
-      owner: ""
+      address:           r[:address],
+      description:       r[:description],
+      on_notice_to:      r[:on_notice_to],
+      on_notice_to_raw:  r[:on_notice_to_raw],
+      title_reference:   r[:title_reference],
+      applicant:         "",
+      owner:             ""
     },
     extras: {
-      document_url:     r[:document_url],
-      on_notice_to:     r[:date_received],
-      on_notice_to_raw: r[:date_received_raw],
-      title_reference:  r[:title_reference]
+      document_url: r[:document_url]
     }
   )
 end
 
-puts "Done #{TABLE}."
+puts "Done #{TABLE}. Saved #{items.length} item(s)."