# dorset.rb
# Dorset Council — Advertised Development Applications
#
# Source: https://www.dorset.tas.gov.au/online-development-application-enquiry
#
# Page structure — each application is a <p><a href="PDF_URL">text</a></p>:
#
#   PLA/2026/22: Residential dwelling and carport addition - Chris Triebe
#   and Associates Town Planning Services - 13 Gladstone Road
#   Herrick - Closes 18.04.2026
#
# Text format: REF: DESCRIPTION - APPLICANT - ADDRESS - Closes DD.MM.YYYY
#
# Note: the old eServices portal (eservices.dorset.tas.gov.au) is still live
# and was the previous data source. The council now publishes the advertised
# list on its main website with direct PDF links, which is simpler to scrape.
  16. require "date"
  17. require "nokogiri"
  18. require "uri"
  19. require "fileutils"
  20. require_relative "../lib/scraper_helpers"
  21. require_relative "../lib/util"
  22. require_relative "../lib/log"
  23. TABLE = ENV.fetch("TABLE_NAME")
  24. URL = "https://www.dorset.tas.gov.au/online-development-application-enquiry"
  25. DOWNLOAD_ATTACHMENTS = ENV["DOWNLOAD_ATTACHMENTS"] == "1"
  26. DOWNLOAD_DIR = ENV["DOWNLOAD_DIR"] || "/app/downloads"
  27. DB.ensure_table!(TABLE)
  28. REF_RX = /\bPLA\/\d{4}\/\d+\b/i
  29. CLOSE_RX = /\bCloses\s+(\d{1,2}[.\-]\d{1,2}[.\-]\d{4})\b/i
  30. def safe_name(s) = s.to_s.gsub(/[^\w\-.]+/, "_")
  31. def download_pdf(url, council_reference)
  32. return nil if url.to_s.strip.empty?
  33. dir = File.join(DOWNLOAD_DIR, "dorset", safe_name(council_reference))
  34. FileUtils.mkdir_p(dir)
  35. fname = safe_name(File.basename(URI.parse(url).path))
  36. fname = "document.pdf" if fname.empty?
  37. path = File.join(dir, fname)
  38. body = Http.get(url, headers: { "Accept" => "application/pdf,*/*", "Referer" => URL })
  39. File.binwrite(path, body)
  40. puts " saved #{fname} (#{body.bytesize} bytes)"
  41. "/files/dorset/#{safe_name(council_reference)}/#{fname}"
  42. rescue StandardError => e
  43. Log.warn "dorset", "Download failed for #{url}: #{e.class} #{e.message}"
  44. nil
  45. end
  46. html = Http.get(URL)
  47. doc = Nokogiri::HTML(html)
  48. items = []
  49. doc.css("p a[href]").each do |a|
  50. text = a.text.gsub(/[[:space:]]+/, " ").strip
  51. next unless (ref_m = text.match(REF_RX))
  52. ref = ref_m[0]
  53. # Strip "PLA/YYYY/NNN: " prefix
  54. remainder = text.sub(/\A#{Regexp.escape(ref)}:\s*/i, "")
  55. # Extract and strip closing date from the end
  56. close_raw = ""
  57. on_notice_to = nil
  58. if (close_m = remainder.match(CLOSE_RX))
  59. close_raw = close_m[1]
  60. on_notice_to = Date.strptime(close_raw, "%d.%m.%Y") rescue nil
  61. remainder = remainder.sub(/\s*-\s*#{Regexp.escape(close_m[0])}\s*\z/i, "").strip
  62. end
  63. # Remaining text: "Description - Applicant - Address"
  64. # Split on " - "; last part = address, second-to-last = applicant, rest = description
  65. parts = remainder.split(/\s+-\s+/)
  66. if parts.length >= 3
  67. address = parts.last.strip
  68. applicant = parts[-2].strip
  69. description = parts[0..-3].join(" - ").strip
  70. elsif parts.length == 2
  71. address = parts.last.strip
  72. applicant = ""
  73. description = parts.first.strip
  74. else
  75. address = remainder.strip
  76. applicant = ""
  77. description = "Development Application"
  78. end
  79. next if address.empty?
  80. description = "Development Application" if description.empty?
  81. pdf_url = abs_url(URL, a["href"].to_s.strip)
  82. items << {
  83. council_reference: ref,
  84. address: address,
  85. description: description,
  86. applicant: applicant,
  87. on_notice_to: on_notice_to,
  88. on_notice_to_raw: close_raw,
  89. document_url: pdf_url
  90. }
  91. end
  92. puts "Found #{items.length} item(s) for #{TABLE}"
  93. items.each do |r|
  94. local_url = DOWNLOAD_ATTACHMENTS ? download_pdf(r[:document_url], r[:council_reference]) : nil
  95. upsert_and_enrich!(
  96. table: TABLE,
  97. row: {
  98. council_reference: r[:council_reference],
  99. address: r[:address],
  100. description: r[:description],
  101. applicant: r[:applicant],
  102. on_notice_to: r[:on_notice_to],
  103. on_notice_to_raw: r[:on_notice_to_raw],
  104. owner: ""
  105. },
  106. extras: {
  107. document_url: r[:document_url],
  108. local_document_url: local_url
  109. }
  110. )
  111. end
  112. puts "Done #{TABLE}. Saved #{items.length} item(s)."