Ver código fonte

Kentish Updates

Benjamin Harris 2 meses atrás
pai
commit
97a956e8da
1 arquivos alterados com 63 adições e 114 exclusões
  1. 63 114
      scrapers/kentish.rb

+ 63 - 114
scrapers/kentish.rb

@@ -14,113 +14,58 @@ URL   = "https://www.kentish.tas.gov.au/services/building-and-planning-services/
 DB.ensure_table!(TABLE)
 
 def abs_url(base, href)
-  return "" if href.to_s.strip.empty?
-  URI.join(base, href).to_s rescue href.to_s
+  h = href.to_s.strip
+  return nil if h.empty?
+  return h if h.start_with?("http://", "https://")
+  URI.join(base, h).to_s
+rescue URI::InvalidURIError
+  h
 end
 
-# Reference formats like:
-#   DA 2025/00123
-#   DA2025/00123
-#   Application No. DA 2025/123
-REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-\._]+)}i      # DA 2025/0123
-REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i          # DA2025-0123 or DA2025/0123
-REF_RX3 = %r{\bDA\s*([0-9]{1,4})\s*-\s*(20\d{2})\b}i           # DA 114-2025
-
-def extract_ref(str)
-  s = CGI.unescape(str.to_s)
-  if (m = s.match(REF_RX1))
-    return "DA #{m[1]} / #{m[2]}"
-  end
-  if (m = s.match(REF_RX2))
-    return "DA #{m[1]} / #{m[2]}"
-  end
-  if (m = s.match(REF_RX3))
-    return "DA #{m[2]} / #{m[1]}"
-  end
-  nil
-end
-
-DATE_RX = /
-  (\b\d{1,2}\/\d{1,2}\/\d{2,4}\b|
-   \b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b|
-   \b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)
-/x
-
-def extract_on_notice_raw(text)
-  s = text.to_s.gsub(/\s+/, " ")
-
-  if (m = s.match(/\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i))
-    if (d = m[2].match(DATE_RX))
-      return d[1]
-    end
-  end
-
-  if (m = s.match(/clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i))
-    if (d = m[2].match(DATE_RX))
-      return d[1]
-    end
-  end
+# Kentish uses K-DA{number}/{year} format, e.g. K-DA016/2026
+REF_RX = /\bK-DA\d+\/20\d{2}\b/i
 
-  if (d = s.match(DATE_RX))
-    return d[1]
-  end
+def parse_items(doc, base_url)
+  rows = []
 
-  ""
-end
+  # Each DA is a <li class="generic-list__item"> with a PDF link in the title
+  # Link text: "K-DA016/2026 41 George Road, Nook - proposed 2 Lot Subdivision (submissions by 21/04/2026)"
+  doc.css("li.generic-list__item").each do |li|
+    link = li.at_css("h3.generic-list__title a, a[href$='.pdf']")
+    next unless link
 
-def first_meaningful_text(node)
-  return "" unless node
-  t = node.text.to_s.strip.gsub(/\s+/, " ")
-  t
-end
+    raw_text = link.text.gsub(/\(PDF File[^)]*\)/i, "").gsub(/\s+/, " ").strip
+    pdf_href = link["href"].to_s
 
-def nearest_context_text(a)
-  host = a.ancestors("li, p, div, tr").first || a.parent
-  first_meaningful_text(host)
-end
+    ref_match = raw_text.match(REF_RX)
+    next unless ref_match
 
-def parse_document_list(doc, base_url)
-  # Look for clear “items”: pdf links, or list/table rows containing one
-  anchors = doc.css("a").select { |a|
-    href = a["href"].to_s
-    a.text.to_s.strip.match?(/application|permit|advertis/i) || href.downcase.end_with?(".pdf")
-  }
+    ref  = ref_match[0]
+    rest = raw_text.sub(ref, "").strip
 
-  rows = []
-  anchors.each do |a|
-    href = a["href"].to_s
-    pdf  = abs_url(base_url, href)
-    ctx  = nearest_context_text(a)
-    link_text = a.text.to_s.strip
-
-    text_for_parse = [link_text, ctx].uniq.join(" — ")
-
-    # Try to pull fields
-    ref  = extract_ref(text_for_parse)
-    addr = if link_text.length > 6
-      link_text
-    else
-      ctx[0, 140]
-    end
-
-    on_raw = extract_on_notice_raw(text_for_parse)
+    # Extract on-notice date: "(submissions by 21/04/2026)"
+    on_raw = rest[/\(submissions\s+by\s+([^)]+)\)/i, 1]&.strip || ""
     on_dt  = Util.parse_aus_date(on_raw)
 
-    desc = if text_for_parse =~ /proposal\s*[:\-]\s*([^—\-]+)\b/i
-      $1.strip
+    # Strip the on-notice clause and split "address - description"
+    body = rest.sub(/\s*\(submissions\s+by\s+[^)]+\)/i, "").strip
+    if (m = body.match(/\A(.+?)\s+-\s+(.+)\z/))
+      address     = m[1].strip
+      description = m[2].strip
     else
-      "Development Application"
+      address     = body
+      description = "Development Application"
     end
 
-    next if ref.nil? || addr.to_s.strip.empty?
+    next if address.empty?
 
     rows << {
       council_reference: ref,
-      address: addr.to_s.strip,
-      description: desc,
-      date_received: on_dt,
-      date_received_raw: on_raw,
-      document_url: pdf
+      address:           address[0, 255],
+      description:       description,
+      on_notice_to:      on_dt,
+      on_notice_to_raw:  on_raw,
+      document_url:      abs_url(base_url, pdf_href)
     }
   end
 
@@ -144,32 +89,36 @@ if html.include?("Just a moment") || html.include?("Enable JavaScript and cookie
   exit 0
 end
 
-doc = Nokogiri::HTML(html)
-items = parse_document_list(doc, URL)
+doc   = Nokogiri::HTML(html)
+items = parse_items(doc, URL)
 
 puts "Found #{items.length} item(s) for #{TABLE}"
 
+saved = 0
 items.each do |r|
-  DB.upsert(TABLE, {
-    description:       r[:description],
-    date_received:     r[:date_received],
-    date_received_raw: r[:date_received_raw],
-    on_notice_to:      r[:date_received],
-    on_notice_to_raw:  r[:date_received_raw],
-    address:           r[:address],
-    council_reference: r[:council_reference],
-    document_url:      r[:document_url],
-    applicant:         "",
-    owner:             ""
-  })
-
-  enrich_after_upsert!(
-    table:             TABLE,
-    council_reference: r[:council_reference],
-    address:           r[:address]
-  )
-
-  puts "Upserted #{r[:council_reference]} -> #{r[:address]}"
+  begin
+    DB.upsert(TABLE, {
+      description:       r[:description],
+      on_notice_to:      r[:on_notice_to],
+      on_notice_to_raw:  r[:on_notice_to_raw],
+      address:           r[:address],
+      council_reference: r[:council_reference],
+      document_url:      r[:document_url],
+      applicant:         "",
+      owner:             ""
+    })
+
+    enrich_after_upsert!(
+      table:             TABLE,
+      council_reference: r[:council_reference],
+      address:           r[:address]
+    )
+
+    Log.info "kentish", "Upserted #{r[:council_reference]} -> #{r[:address]}"
+    saved += 1
+  rescue StandardError => e
+    Log.warn "kentish", "DB error for #{r[:council_reference]}: #{e.class} #{e.message}"
+  end
 end
 
-puts "Done #{TABLE}."
+puts "Done #{TABLE}. Saved #{saved} item(s)."