# latrobe.rb
# Latrobe Council – PlanBuild "Currently Advertised" scraper
  2. require "nokogiri"
  3. require_relative "../lib/http"
  4. require_relative "../lib/db"
  5. require_relative "../lib/util"
  6. require_relative "../lib/enrich"
  7. TABLE = ENV.fetch("TABLE_NAME")
  8. URL = ENV.fetch("PLANBUILD_URL", "https://portal.planbuild.tas.gov.au/external/advertisement/search")
  9. COUNCIL_NAME = "Latrobe Council"
  10. # Safe reference matcher (slashes inside are fine with %r{...})
  11. REF_RX = %r{(Application|Reference)\s*(No\.?|Number)?:\s*([A-Za-z0-9\-._/]+)}i
  12. DB.ensure_table!(TABLE)
  13. def extract_text_between(text, label_regex, stop_regexes)
  14. if (m = text.match(label_regex))
  15. start = m.end(0)
  16. tail = text[start..-1]
  17. stop = stop_regexes.map { |r| (tail =~ r) }.compact.min
  18. stop ? tail[0...stop].strip : tail.strip
  19. end
  20. end
  21. html = Http.get(URL)
  22. doc = Nokogiri::HTML(html)
  23. blocks = doc.css(".advertisement-result, .panel.panel-default, .panel.panel-info, .result-row, .row")
  24. saved = 0
  25. blocks.each do |blk|
  26. text = blk.text.strip.gsub(/\s+/, " ")
  27. next unless text.match?(/Application|Reference|Council/i)
  28. address_el = blk.at_css(".address, [data-field='address'], .col-xs-8, .col-sm-8")
  29. ref_el = blk.at_css(".reference, [data-field='reference'], .col-xs-4, .col-sm-4")
  30. address = address_el&.text&.strip.to_s
  31. council_reference = ref_el&.text&.strip.to_s
  32. address = extract_text_between(text, /Address:\s*/i,
  33. [/Reference:/i, /Application/i, /Council:/i, /\z/]) if address.empty?
  34. if council_reference.empty?
  35. if (m = text.match(REF_RX))
  36. council_reference = m[3].strip
  37. end
  38. end
  39. council_name = if (m = text.match(/Council:\s*([A-Za-z \-]+Council)/i))
  40. m[1].strip
  41. end
  42. next unless council_name&.include?(COUNCIL_NAME)
  43. description = extract_text_between(
  44. text,
  45. /(Type of Work|Proposal|Description):\s*/i,
  46. [/Address:/i, /Application/i, /Reference/i, /Council:/i, /\z/]
  47. ) || ""
  48. date_received_raw =
  49. if (m = text.match(/(Date Lodged|Date Received|Lodged):\s*([0-9]{1,2}\/[0-9]{1,2}\/[0-9]{2,4})/i))
  50. m[2].strip
  51. else
  52. ""
  53. end
  54. date_received = Util.parse_aus_date(date_received_raw)
  55. next if address.empty? || council_reference.empty?
  56. DB.upsert(TABLE, {
  57. description: description,
  58. date_received: date_received,
  59. date_received_raw: date_received_raw,
  60. address: address,
  61. council_reference: council_reference,
  62. applicant: "",
  63. owner: ""
  64. })
  65. enrich_after_upsert!(
  66. table: TABLE,
  67. council_reference: council_reference,
  68. address: address
  69. )
  70. puts "Upserted #{council_reference} | #{address}"
  71. saved += 1
  72. end
  73. puts "Done #{TABLE}. Saved #{saved} item(s)."