northernmidlands.rb 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. # Northern Midlands Council — Advertised / Planning Applications (site page)
  2. require "nokogiri"
  3. require "uri"
  4. require "cgi"
  5. require_relative "../lib/scraper_helpers"
  6. require_relative "../lib/util"
  7. require_relative "../lib/log"
  # Page that lists the council's advertised development applications.
  URL = "https://northernmidlands.tas.gov.au/planning/development-in-the-northern-midlands/development-applications-2"
  # Destination table name; must be supplied through the TABLE_NAME environment
  # variable (per the original note: run_all.sh -> da_northernmidlands).
  TABLE = ENV.fetch("TABLE_NAME")
  # Make sure the destination table exists before any row is written.
  DB.ensure_table!(TABLE)
  # Reference spellings handled: “DA 2025/00123”, “DA2025/00123”, “Application No. DA 2025/123”.
  # REF_RX1: "DA", optional spaces, a 20xx year, a slash, then the sequence token.
  REF_RX1 = %r{\bDA\s*(20\d{2})\s*/\s*([A-Za-z0-9\-_.]+)}i
  # REF_RX2: "DA" immediately followed by the year, an optional "-" or "/", then 3+ digits
  # (this is the form that can match a file name such as "DA2025-00123.pdf").
  REF_RX2 = %r{\bDA(20\d{2})\s*[-\/]?\s*([0-9]{3,})\b}i

  # Finds a DA reference in +str+ and returns it normalised as "DA <year> / <sequence>".
  #
  # The input is URL-decoded first, so an encoded slash (%2F) in a file name is recognised.
  #
  # @param str [#to_s] free text, link text or a file name (nil is treated as "")
  # @return [String, nil] the normalised reference, or nil when neither pattern matches
  def extract_ref(str)
    s = CGI.unescape(str.to_s)
    [REF_RX1, REF_RX2].each do |rx|
      m = s.match(rx)
      next unless m

      # REF_RX1's sequence class admits ".", "-" and "_", so a reference that ends a
      # sentence ("DA 2025/00123.") would otherwise keep the trailing punctuation and
      # be stored as a different reference from "DA 2025/00123".
      seq = m[2].sub(/[-_.]+\z/, "")
      return "DA #{m[1]} / #{seq}" unless seq.empty?
    end
    nil
  end
  # Date spellings recognised (extended-mode regexp, whitespace in the literal is ignored):
  #   d/m/yy or d/m/yyyy  |  "12 May 2025"  |  "May 12, 2025"
  DATE_RX = /
  (\b\d{1,2}\/\d{1,2}\/\d{2,4}\b|
  \b\d{1,2}\s+[A-Za-z]{3,}\s+\d{4}\b|
  \b[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}\b)
  /x

  # Pulls the raw (unparsed) on-notice / closing date string out of +text+.
  #
  # Preference order: a date following "on notice until/to", then a date following
  # "close/closing/closes [on]", then the first date found anywhere in the text.
  #
  # @param text [#to_s] text to search (runs of whitespace are collapsed first)
  # @return [String] the matched date text, or "" when no date is present
  def extract_on_notice_raw(text)
    flat = text.to_s.gsub(/\s+/, " ")

    labelled_patterns = [
      /\bon\s*notice\s*(until|to)\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i,
      /clos(?:e|ing|es)\s*(on)?\s*[:\-]?\s*([A-Za-z0-9\/ ,]+)/i
    ]

    labelled_patterns.each do |pattern|
      # Capture 2 is the text that trails the label; it only counts if it holds a date.
      trailing = flat[pattern, 2]
      next unless trailing

      found = trailing[DATE_RX, 1]
      return found if found
    end

    flat[DATE_RX, 1] || ""
  end
  # Returns the whitespace-normalised text of the element that encloses link +a+.
  #
  # The first li/p/div/tr/article ancestor is used when there is one; otherwise the
  # direct parent. Returns "" when neither is available.
  #
  # @param a [#ancestors, #parent] the link node
  # @return [String]
  def nearest_context_text(a)
    container = a.ancestors("li, p, div, tr, article").first
    container ||= a.parent
    return "" unless container

    container.text.to_s.strip.gsub(/\s+/, " ")
  end
  # Scrapes application rows out of the council page.
  #
  # Two passes are made over +doc+:
  # 1. every <a> whose text mentions application/permit/planning/advertis* or whose
  #    href ends in ".pdf";
  # 2. every <table> that has a header cell mentioning application/reference/
  #    proposal/address/notice/closing.
  # Rows without a recognisable DA reference or without an address are dropped, and
  # the combined result is de-duplicated on [council_reference, address].
  #
  # @param doc [#css] parsed page (the script passes a Nokogiri document)
  # @param base_url [String] base handed to abs_url to resolve link hrefs
  # @return [Array<Hash>] hashes with :council_reference, :address, :description,
  #   :date_received, :date_received_raw, :document_url and :title_reference
  def parse_items(doc, base_url)
    rows = parse_anchor_rows(doc, base_url) + parse_table_rows(doc)
    rows.uniq { |r| [r[:council_reference], r[:address]] }
  end

  # Pass 1 of parse_items: builds one row per matching <a> element.
  def parse_anchor_rows(doc, base_url)
    anchors = doc.css("a").select do |a|
      a.text.to_s.strip.match?(/application|permit|planning|advertis/i) ||
        a["href"].to_s.downcase.end_with?(".pdf")
    end

    anchors.each_with_object([]) do |a, rows|
      link_text = a.text.to_s.strip
      document_url = abs_url(base_url, a["href"].to_s)
      ctx = nearest_context_text(a)
      text_for_parse = [link_text, ctx].reject(&:empty?).uniq.join(" — ")

      # Reference from the text first, then from the linked file name. to_s keeps
      # File.basename from raising should abs_url return nil (its contract is not
      # visible from this file).
      ref = extract_ref(text_for_parse) || extract_ref(File.basename(document_url.to_s))
      # Address: prefer the link text, else the start of the surrounding text.
      address = (link_text.length >= 6 ? link_text : ctx[0, 140]).to_s.strip
      next if ref.nil? || address.empty?

      # The date stored under :date_received is the on-notice / closing date found in the text.
      on_raw = extract_on_notice_raw(text_for_parse)
      proposal = text_for_parse[/proposal\s*[:\-]\s*([^—\-]+)\b/i, 1]

      rows << {
        council_reference: ref,
        address: address,
        description: proposal ? proposal.strip : "Development Application",
        date_received: Util.parse_aus_date(on_raw),
        date_received_raw: on_raw,
        document_url: document_url,
        # Title to keep: the link text, or the start of the context when the link is empty.
        title_reference: link_text.empty? ? ctx[0, 200] : link_text
      }
    end
  end

  # Pass 2 of parse_items: builds one row per <tr> (with at least two <td>) of every
  # table whose header cells look application-related.
  def parse_table_rows(doc)
    doc.css("table").each_with_object([]) do |table, rows|
      heads = table.css("th").map { |th| th.text.strip.downcase }
      next unless heads.any? { |h| h.match?(/application|reference|proposal|address|notice|closing/) }

      table.css("tr").each do |tr|
        cells = tr.css("td")
        next unless cells.length >= 2

        row_text = tr.text.to_s.strip.gsub(/\s+/, " ")
        ref = extract_ref(row_text)

        # A labelled address ("Address: ...") is looked for cell by cell first so the
        # value stops at the end of its own cell. Matching only against the collapsed
        # row text (where no run of 2+ whitespace can remain to act as a delimiter)
        # made the address swallow every following cell. The row-level match is kept
        # as a fallback for a label and value that sit in separate cells.
        cell_texts = cells.map { |td| td.text.to_s.strip.gsub(/\s+/, " ") }
        addr = cell_texts.map { |c| c[/address[:\s]+(.+)/i, 1] }.compact.first ||
               row_text[/address[:\s]+(.+)/i, 1] ||
               row_text[0, 140]
        addr = addr.to_s.strip
        next if ref.nil? || addr.empty?

        on_raw = extract_on_notice_raw(row_text)
        rows << {
          council_reference: ref,
          address: addr,
          description: "Development Application",
          date_received: Util.parse_aus_date(on_raw),
          date_received_raw: on_raw,
          document_url: "",
          title_reference: row_text[0, 200]
        }
      end
    end
  end
  # Guard against an empty target URL. URL is a constant defined near the top of this
  # script and is not read from the environment, so the warning points at the
  # constant rather than at an environment variable.
  if URL.empty?
    Log.warn "scraper", "northernmidlands: URL is empty. Set the URL constant in this scraper to the council's advertised-applications page."
    exit 0
  end
  # Download the listing page; any fetch error is logged and ends the run with status 1.
  html =
    begin
      # Per the original note: some councils use ePathway ("/eservice/" URLs), which
      # needs a cookie-warmed session, hence the dedicated session getter.
      uses_eservice = URL.include?("/eservice/")
      uses_eservice ? Http.dorset_session_get(URL) : Http.get(URL)
    rescue StandardError => e
      Log.warn "scraper", "Failed to fetch #{URL}: #{e.class} #{e.message}"
      exit 1
    end
  # Parse the downloaded page and store every application row that was found.
  doc = Nokogiri::HTML(html)
  items = parse_items(doc, URL)
  puts "Found #{items.length} item(s) for #{TABLE}"

  items.each do |item|
    # Core columns; applicant and owner are written as empty strings.
    core_fields = {
      description: item[:description],
      date_received: item[:date_received],
      date_received_raw: item[:date_received_raw],
      address: item[:address],
      council_reference: item[:council_reference],
      applicant: "",
      owner: ""
    }
    # Extras repeat the scraped date under the on_notice_to* keys.
    extra_fields = {
      document_url: item[:document_url],
      on_notice_to: item[:date_received],
      on_notice_to_raw: item[:date_received_raw],
      title_reference: item[:title_reference]
    }
    upsert_and_enrich!(table: TABLE, row: core_fields, extras: extra_fields)
  end

  puts "Done #{TABLE}."