# Source file: northernmidlands.rb (6.1 KB)
  # Northern Midlands Council — Advertised / Planning Applications (site page)
  require "nokogiri"
  require "uri"
  require "cgi"
  require_relative "../lib/http"
  require_relative "../lib/db"
  require_relative "../lib/util"
  require_relative "../lib/enrich"

  # Destination table name; run_all.sh supplies TABLE_NAME (da_northernmidlands).
  # ENV.fetch raises KeyError when the variable is missing.
  TABLE = ENV.fetch("TABLE_NAME")

  # Listing page scraped when no override is supplied.
  DEFAULT_URL = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"

  # Page to scrape. NORTHERN_MIDLANDS_URL (the variable named in this script's
  # "is not set" warning) overrides the default; setting it to an empty string
  # makes URL empty.
  URL = ENV.fetch("NORTHERN_MIDLANDS_URL", DEFAULT_URL).to_s.strip

  # Create the destination table up front via the project DB helper.
  DB.ensure_table!(TABLE)
  # Optional extras: columns this scraper fills in with a follow-up UPDATE.
  # Each column is attempted on its own so that one failing ALTER does not stop
  # the remaining columns from being added; failures are reported and ignored
  # (best effort).
  # NOTE(review): `ADD COLUMN IF NOT EXISTS` is not accepted by every
  # MySQL-compatible server — confirm against the deployed database.
  {
    "document_url" => "TEXT NULL",
    "on_notice_to" => "DATE NULL",
    "on_notice_to_raw" => "VARCHAR(80) NULL",
    "title_reference" => "TEXT NULL"
  }.each do |column, definition|
    begin
      DB.client.query("ALTER TABLE `#{DB.client.escape(TABLE)}` ADD COLUMN IF NOT EXISTS #{column} #{definition}")
    rescue => e
      warn "Optional column add skipped (#{column}): #{e.class} #{e.message}"
    end
  end
  # Resolve +href+ against +base+ and return an absolute URL string.
  #
  # base - String (or URI) the link was found on.
  # href - raw href attribute value; may be nil.
  #
  # Returns "" when +href+ is nil or blank. Surrounding whitespace is trimmed
  # before joining, and an href containing literal spaces is retried with the
  # spaces percent-encoded. When the pair still cannot be joined, the trimmed
  # href is returned as-is so the caller keeps whatever the page supplied.
  def abs_url(base, href)
    link = href.to_s.strip
    return "" if link.empty?

    begin
      URI.join(base, link).to_s
    rescue URI::InvalidURIError
      # Unencoded spaces are the usual reason an href fails to parse.
      URI.join(base, link.gsub(" ", "%20")).to_s
    end
  rescue URI::Error, ArgumentError
    link
  end
  # “DA 2025/00123”, “DA2025/00123”, “Application No. DA 2025/123”
  # REF_RX1: "DA", a 20xx year, a slash, then an alphanumeric suffix.
  REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i
  # REF_RX2: "DA" fused to the year, optional dash/slash, then 3+ digits
  # (the shape that tends to appear in file names).
  REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i

  # Pull a council reference out of free text or a (possibly URL-encoded)
  # file name and normalise it to "DA <year> / <suffix>".
  #
  # str - any object; converted with #to_s and URL-decoded before matching.
  #
  # Returns the normalised reference String, or nil when nothing matches.
  def extract_ref(str)
    # scrub: a truncated percent-escape decodes to invalid bytes, which would
    # otherwise make the regexp match raise.
    text = CGI.unescape(str.to_s).scrub("")

    [REF_RX1, REF_RX2].each do |pattern|
      match = text.match(pattern)
      next unless match

      # REF_RX1 allows ".", "-" and "_" inside the suffix, so sentence
      # punctuation right after a reference would otherwise be captured.
      suffix = match[2].sub(/[.\-_]+\z/, "")
      return "DA #{match[1]} / #{suffix}" unless suffix.empty?
    end

    nil
  end
  # Date shapes recognised in page text (capture 1 is the date itself):
  # d/m/yy(yy), "12 March 2025", and "March 12, 2025".
  DATE_RX = /
  (\b\d{1,2}\/\d{1,2}\/\d{2,4}\b|
  \b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b|
  \b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)
  /x

  # Phrases that introduce an on-notice / closing date, tried in order.
  # Capture 1 is the text following the phrase. The leading \b keeps "clos"
  # from matching inside words such as "disclosed".
  NOTICE_LEAD_INS = [
    /\bon\s*notice\s*(?:until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i,
    /\bclos(?:es|ing|e)\s*(?:on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
  ].freeze

  # Find the raw on-notice / closing date string in +text+.
  #
  # Preference order: a date following "on notice until/to", then a date
  # following "close/closes/closing [on]", then the first date anywhere.
  #
  # text - any object; converted with #to_s. All whitespace runs, including
  #        non-breaking spaces, are collapsed to a single space first.
  #
  # Returns the matched date text, or "" when no date is present.
  def extract_on_notice_raw(text)
    flat = text.to_s.gsub(/[[:space:]]+/, " ")

    NOTICE_LEAD_INS.each do |lead_in|
      tail = flat[lead_in, 1]
      date = tail && tail[DATE_RX, 1]
      return date if date
    end

    flat[DATE_RX, 1] || ""
  end
  # Text of the closest enclosing li / p / div / tr / article of node +a+
  # (falling back to its parent), with whitespace normalised.
  #
  # a - a node responding to #ancestors(selector) and #parent.
  #
  # Returns the host element's text with every whitespace run — including
  # non-breaking spaces, which ASCII-only \s and #strip leave behind —
  # collapsed to one space and trimmed; "" when there is no host element.
  def nearest_context_text(a)
    host = a.ancestors("li, p, div, tr, article").first || a.parent
    return "" unless host

    host.text.to_s.gsub(/[[:space:]]+/, " ").strip
  end
  # Scan a parsed listing page for development applications.
  #
  # Two passes are combined: links that look like application documents, then
  # rows of any table whose headers mention application-related terms.
  #
  # doc      - parsed HTML document (must respond to #css).
  # base_url - URL the page came from; used to make hrefs absolute.
  #
  # Returns an Array of Hashes with the keys :council_reference, :address,
  # :description, :date_received (the parsed on-notice date),
  # :date_received_raw, :document_url and :title_reference. Entries are
  # de-duplicated on [council_reference, address]; the first one wins.
  def parse_items(doc, base_url)
    rows = parse_anchor_items(doc, base_url) + parse_table_items(doc)
    rows.uniq { |r| [r[:council_reference], r[:address]] }
  end

  # Pass 1: anchors whose text mentions an application keyword or whose href
  # points at a PDF (a query string or fragment after ".pdf" is accepted).
  def parse_anchor_items(doc, base_url)
    doc.css("a").each_with_object([]) do |a, rows|
      href = a["href"].to_s.strip
      link_text = a.text.to_s.strip
      next unless link_text.match?(/application|permit|planning|advertis/i) ||
                  href.match?(/\.pdf(?:[?#]|\z)/i)

      document_url = abs_url(base_url, href)
      ctx = nearest_context_text(a)
      text_for_parse = [link_text, ctx].reject(&:empty?).uniq.join(" — ")

      # Reference from the visible text first, else from the file name.
      ref = extract_ref(text_for_parse) || extract_ref(File.basename(document_url))
      # Address: prefer the link text, else a slice of the surrounding text.
      address = (link_text.length >= 6 ? link_text : ctx[0, 140]).to_s.strip
      next if ref.nil? || address.empty?

      on_raw = extract_on_notice_raw(text_for_parse)
      proposal = text_for_parse[/proposal\s*[:\-]\s*([^—\-]+)\b/i, 1]

      rows << {
        council_reference: ref,
        address: address,
        description: proposal ? proposal.strip : "Development Application",
        date_received: Util.parse_aus_date(on_raw),
        date_received_raw: on_raw,
        document_url: document_url,
        title_reference: link_text.empty? ? ctx[0, 200] : link_text
      }
    end
  end

  # Pass 2: rows (with at least two <td> cells) of tables whose header cells
  # mention application / reference / proposal / address / notice / closing.
  def parse_table_items(doc)
    doc.css("table").each_with_object([]) do |table, rows|
      heads = table.css("th").map { |th| th.text.strip.downcase }
      next unless heads.any? { |h| h.match?(/application|reference|proposal|address|notice|closing/) }

      table.css("tr").each do |tr|
        next unless tr.css("td").length >= 2

        row_text = tr.text.to_s.strip.gsub(/\s+/, " ")
        ref = extract_ref(row_text)
        next if ref.nil?

        # Cells are joined with a double space so the address pattern's
        # `\s{2,}` terminator stops at a cell boundary; on fully collapsed
        # text it could never match and the address ran to the end of the row.
        cell_text = tr.css("th, td")
                      .map { |cell| cell.text.to_s.strip.gsub(/\s+/, " ") }
                      .reject(&:empty?)
                      .join("  ")
        addr = (cell_text[/address[:\s]+(.+?)(?:\s{2,}|$)/i, 1] || row_text[0, 140]).to_s.strip
        next if addr.empty?

        on_raw = extract_on_notice_raw(row_text)

        rows << {
          council_reference: ref,
          address: addr,
          description: "Development Application",
          date_received: Util.parse_aus_date(on_raw),
          date_received_raw: on_raw,
          document_url: "",
          title_reference: row_text[0, 200]
        }
      end
    end
  end
  # Nothing to do without a listing URL: explain how to supply one and exit
  # with status 0.
  if URL.empty?
    warn "NORTHERN_MIDLANDS_URL is not set. Example:\n ONLY=northernmidlands NORTHERN_MIDLANDS_URL='https://.../advertised-applications' docker compose run --rm scraper /app/run_all.sh"
    exit 0
  end

  # Fetch the listing page; any fetch error is reported and exits with status 1.
  html =
    begin
      if URL.include?("/eservice/")
        # Some councils use ePathway, which needs a cookie-warmed session
        Http.dorset_session_get(URL)
      else
        Http.get(URL)
      end
    rescue => e
      warn "Failed to fetch #{URL}: #{e.class} #{e.message}"
      exit 1
    end

  doc = Nokogiri::HTML(html)
  items = parse_items(doc, URL)
  puts "Found #{items.length} item(s) for #{TABLE}"
  # Persist every parsed item: upsert the core columns, run the enrichment
  # hook, then best-effort fill the optional extra columns.
  extras_stmt = nil # prepared on first use, then reused for every row

  items.each do |r|
    DB.upsert(TABLE, {
      description: r[:description],
      date_received: r[:date_received],
      date_received_raw: r[:date_received_raw],
      address: r[:address],
      council_reference: r[:council_reference],
      applicant: "",
      owner: ""
    })

    # Key the enrichment on this row's values (the bare names
    # `council_reference` / `address` are not defined in this scope).
    enrich_after_upsert!(
      table: TABLE,
      council_reference: r[:council_reference],
      address: r[:address]
    )

    begin
      # NOTE(review): assumes DB.client returns the same connection on every
      # call, so one prepared statement can serve all rows — confirm in lib/db.
      extras_stmt ||= DB.client.prepare("UPDATE `#{DB.client.escape(TABLE)}` SET document_url = ?, on_notice_to = ?, on_notice_to_raw = ?, title_reference = ? WHERE council_reference = ? AND address = ?")
      extras_stmt.execute(r[:document_url], r[:date_received], r[:date_received_raw], r[:title_reference], r[:council_reference], r[:address])
    rescue => e
      # The extra columns are optional; a failure here must not abort the run.
      warn "Extras update skipped for #{r[:council_reference]}: #{e.class} #{e.message}"
    end

    puts "Upserted #{r[:council_reference]} -> #{r[:address]}"
  end

  puts "Done #{TABLE}."