|
@@ -14,113 +14,58 @@ URL = "https://www.kentish.tas.gov.au/services/building-and-planning-services/
|
|
|
DB.ensure_table!(TABLE)
|
|
DB.ensure_table!(TABLE)
|
|
|
|
|
|
|
|
# Resolves a link target against the page it was found on.
#
# @param base [String] absolute URL of the page containing the link
# @param href [String, nil] raw href attribute value
# @return [String, nil] nil when href is blank; href unchanged (stripped) when
#   it is already an http(s) URL or cannot be resolved; otherwise the absolute
#   URL produced by joining it onto base
def abs_url(base, href)
  h = href.to_s.strip
  return nil if h.empty?
  # Already absolute: skip URI parsing so unusual-but-usable links pass through.
  return h if h.start_with?("http://", "https://")

  URI.join(base, h).to_s
rescue URI::Error
  # URI::Error covers both InvalidURIError (unparseable text) and BadURIError
  # (relative base); either way fall back to the stripped href, best-effort.
  h
end
|
|
|
|
|
|
|
|
# Kentish uses K-DA{number}/{year} format, e.g. K-DA016/2026
REF_RX = /\bK-DA\d+\/20\d{2}\b/i

# Extracts one record per advertised application from the listing page.
#
# Each DA is a <li class="generic-list__item"> with a PDF link in the title.
# Link text looks like:
#   "K-DA016/2026 41 George Road, Nook - proposed 2 Lot Subdivision (submissions by 21/04/2026)"
#
# @param doc [#css] parsed page; each matched item must respond to #at_css,
#   and the link to #text and #[]
# @param base_url [String] page URL, used to absolutise the PDF link
# @return [Array<Hash>] rows with :council_reference, :address, :description,
#   :on_notice_to, :on_notice_to_raw and :document_url
def parse_items(doc, base_url)
  rows = []

  doc.css("li.generic-list__item").each do |li|
    link = li.at_css("h3.generic-list__title a, a[href$='.pdf']")
    next unless link

    # Drop the "(PDF File, ...)" suffix and collapse whitespace.
    raw_text = link.text.gsub(/\(PDF File[^)]*\)/i, "").gsub(/\s+/, " ").strip
    pdf_href = link["href"].to_s

    # Items without a council reference are not applications; skip them.
    ref_match = raw_text.match(REF_RX)
    next unless ref_match

    ref = ref_match[0]
    rest = raw_text.sub(ref, "").strip

    # Extract on-notice date: "(submissions by 21/04/2026)"
    on_raw = rest[/\(submissions\s+by\s+([^)]+)\)/i, 1]&.strip || ""
    on_dt = Util.parse_aus_date(on_raw)

    # Strip the on-notice clause and split "address - description" on the
    # first spaced hyphen.
    body = rest.sub(/\s*\(submissions\s+by\s+[^)]+\)/i, "").strip
    if (m = body.match(/\A(.+?)\s+-\s+(.+)\z/))
      address = m[1].strip
      description = m[2].strip
    else
      address = body
      description = "Development Application"
    end

    next if address.empty?

    rows << {
      council_reference: ref,
      address: address[0, 255],
      description: description,
      on_notice_to: on_dt,
      on_notice_to_raw: on_raw,
      document_url: abs_url(base_url, pdf_href)
    }
  end

  # NOTE(review): the original tail of this method was not visible; returning
  # the accumulated rows is what the caller's use of .length/.each requires —
  # confirm no post-processing (e.g. de-duplication) was lost.
  rows
end
|
|
|
|
|
|
|
@@ -144,32 +89,36 @@ if html.include?("Just a moment") || html.include?("Enable JavaScript and cookie
|
|
|
exit 0
|
|
exit 0
|
|
|
end
|
|
end
|
|
|
|
|
|
|
|
# Parse the fetched listing page and persist every application found on it.
doc = Nokogiri::HTML(html)
items = parse_items(doc, URL)

puts "Found #{items.length} item(s) for #{TABLE}"

# Number of items written without an error being raised.
saved = 0
items.each do |item|
  begin
    record = {
      description: item[:description],
      on_notice_to: item[:on_notice_to],
      on_notice_to_raw: item[:on_notice_to_raw],
      address: item[:address],
      council_reference: item[:council_reference],
      document_url: item[:document_url],
      applicant: "",
      owner: ""
    }
    DB.upsert(TABLE, record)

    enrich_after_upsert!(
      table: TABLE,
      council_reference: item[:council_reference],
      address: item[:address]
    )

    Log.info "kentish", "Upserted #{item[:council_reference]} -> #{item[:address]}"
    saved += 1
  rescue StandardError => err
    # A failure on one item is logged and the loop moves on to the next.
    Log.warn "kentish", "DB error for #{item[:council_reference]}: #{err.class} #{err.message}"
  end
end

puts "Done #{TABLE}. Saved #{saved} item(s)."
|