rssfetch.class.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459
  1. <?php
  2. /*
  3. * Project: MagpieRSS: a simple RSS integration tool
  4. * File: rss_fetch.inc, a simple functional interface
  5. to fetching and parsing RSS files, via the
  6. function fetch_rss()
  7. * Author: Kellan Elliott-McCrea <kellan@protest.net>
  8. * License: GPL
  9. *
  10. * The latest version of MagpieRSS can be obtained from:
  11. * http://magpierss.sourceforge.net
  12. *
  13. * For questions, help, comments, discussion, etc., please join the
  14. * Magpie mailing list:
  15. * magpierss-general@lists.sourceforge.net
  16. *
  17. */
  // Setup MAGPIE_DIR for use on hosts that don't include
  // the current path in include_path.
  // with thanks to rajiv and smarty
  if (!defined('DIR_SEP')) {
      define('DIR_SEP', DIRECTORY_SEPARATOR);
  }
  if (!defined('MAGPIE_DIR')) {
      define('MAGPIE_DIR', dirname(__FILE__) . DIR_SEP);
  }

  require_once MAGPIE_DIR . 'magpierss.class.php';
  require_once MAGPIE_DIR . 'rsscache.class.php';

  // Location of bundled 3rd party libraries. Guarded like DIR_SEP and
  // MAGPIE_DIR above, so a script may point it elsewhere before including
  // this file without triggering an "already defined" diagnostic.
  if (!defined('MAGPIE_EXTLIB')) {
      define('MAGPIE_EXTLIB', MAGPIE_DIR . 'extlib' . DIR_SEP);
  }
  require_once MAGPIE_EXTLIB . 'snoopy.class.php';
  32. /*
  33. * CONSTANTS - redefine these in your script to change the
  34. * behaviour of fetch_rss(); currently, most options affect the cache
  35. *
  36. * MAGPIE_CACHE_ON - Should Magpie cache parsed RSS objects?
  37. * For me a built in cache was essential to creating a "PHP-like"
  38. * feel to Magpie, see rss_cache.inc for rationale
  39. *
  40. *
  41. * MAGPIE_CACHE_DIR - Where should Magpie cache parsed RSS objects?
  42. * This should be a location that the webserver can write to. If this
  43. * directory does not already exist Magpie will try to be smart and create
  44. * it. This will often fail for permissions reasons.
  45. *
  46. *
  47. * MAGPIE_CACHE_AGE - How long to store cached RSS objects? In seconds.
  48. *
  49. *
  50. * MAGPIE_CACHE_FRESH_ONLY - If remote fetch fails, throw error
  51. * instead of returning stale object?
  52. *
  53. * MAGPIE_DEBUG - Display debugging notices?
  54. *
  55. */
  /*=======================================================================*\
      Function:   fetch_rss
      Purpose:    return the RSS object for the given url,
                  maintaining the cache
      Input:      url of the RSS file
      Output:     parsed RSS object (see rss_parse.inc), or false on failure
                  (the failure text is then available via magpie_error())

      NOTES ON CACHING:
      If caching is on (MAGPIE_CACHE_ON) fetch_rss will first check the
      cache and return the cached object on a HIT.

      NOTES ON RETRIEVING REMOTE FILES:
      When the cached object is STALE and carries both an ETag and a
      Last-Modified value, they are sent back as If-None-Match and
      If-Modified-Since. Upon receiving a 304 the cached object is stored
      again (refreshing the cache) and returned.

      NOTES ON FAILED REQUESTS:
      If there is an HTTP error while fetching an RSS object, or the
      response cannot be parsed, the cached version will be returned, if
      it exists (and if MAGPIE_CACHE_FRESH_ONLY is off).
  \*=======================================================================*/
  define('MAGPIE_VERSION', '0.72');

  $MAGPIE_ERROR = "";

  function fetch_rss ($url) {
      // initialize constants
      init();

      if ( !isset($url) or $url === '' ) {
          error("fetch_rss called without a url");
          return false;
      }

      // cache is disabled: fetch the file and parse it, nothing else
      if ( !MAGPIE_CACHE_ON ) {
          $resp = _fetch_remote_file( $url );
          if ( is_success( $resp->status ) ) {
              return _response_to_rss( $resp );
          }
          error("Failed to fetch $url and cache is off");
          return false;
      }

      // cache is ON
      // Flow
      // 1. check cache
      // 2. if there is a hit, make sure its fresh
      // 3. if cached obj fails freshness check, fetch remote
      // 4. if remote fails, return stale object, or error
      $cache = new RSSCache( MAGPIE_CACHE_DIR, MAGPIE_CACHE_AGE );
      if ( MAGPIE_DEBUG and $cache->ERROR ) {
          debug($cache->ERROR, E_USER_WARNING);
      }

      // A string (not 0) so the comparisons below can be strict: on
      // PHP < 8 the loose test 0 == 'HIT' is true, which sent a broken
      // cache down the HIT path.
      $cache_status    = '';        // response of check_cache
      $request_headers = array();   // HTTP headers to send with fetch
      $stale           = false;     // previously cached object, if any
      $errormsg        = '';        // errors, if any

      // store parsed XML by desired output encoding
      // as character munging happens at parse time
      $cache_key = $url . MAGPIE_OUTPUT_ENCODING;

      if ( !$cache->ERROR ) {
          // return cache HIT, MISS, or STALE
          $cache_status = $cache->check_cache( $cache_key );
      }

      // if object cached, and cache is fresh, return cached obj
      if ( $cache_status === 'HIT' ) {
          $cached = $cache->get( $cache_key );
          if ( $cached ) {
              // should be cache age
              $cached->from_cache = 1;
              if ( MAGPIE_DEBUG > 1 ) {
                  debug("MagpieRSS: Cache HIT", E_USER_NOTICE);
              }
              return $cached;
          }
      }

      // else attempt a conditional get: set up the validator headers
      if ( $cache_status === 'STALE' ) {
          $stale = $cache->get( $cache_key );
          if ( $stale and !empty($stale->etag) and !empty($stale->last_modified) ) {
              $request_headers['If-None-Match']     = $stale->etag;
              // the request-side counterpart of Last-Modified is
              // If-Modified-Since ("If-Last-Modified" is not an HTTP header)
              $request_headers['If-Modified-Since'] = $stale->last_modified;
          }
      }

      $resp = _fetch_remote_file( $url, $request_headers );

      if ( $resp ) {
          // A 304 is only usable when there is a cached object to hand
          // back; otherwise it falls through to the error branch below
          // instead of caching and returning an empty value.
          if ( $resp->status == '304' and $stale ) {
              // we have the most current copy
              if ( MAGPIE_DEBUG > 1 ) {
                  debug("Got 304 for $url");
              }
              // reset cache on 304 (at minutillo insistent prodding)
              $cache->set( $cache_key, $stale );
              return $stale;
          }
          elseif ( is_success( $resp->status ) ) {
              // Parsed into its own variable so that a parse failure does
              // not discard the stale object needed for the fallback.
              $fresh = _response_to_rss( $resp );
              if ( $fresh ) {
                  if ( MAGPIE_DEBUG > 1 ) {
                      debug("Fetch successful");
                  }
                  // add object to cache
                  $cache->set( $cache_key, $fresh );
                  return $fresh;
              }
              // parse failure: _response_to_rss() has already reported
              // it, so $errormsg stays empty to avoid a second report
          }
          else {
              $errormsg = "Failed to fetch $url ";
              if ( $resp->status == '-100' ) {
                  $errormsg .= "(Request timed out after " . MAGPIE_FETCH_TIME_OUT . " seconds)";
              }
              elseif ( $resp->status == '0' ) {
                  // you sir, are offline
                  return false;
              }
              elseif ( $resp->error ) {
                  # compensate for Snoopy's annoying habit of tacking
                  # on '\n'
                  $http_error = substr($resp->error, 0, -2);
                  $errormsg .= "(HTTP Error: $http_error)";
              }
              else {
                  $errormsg .= "(HTTP Response: " . $resp->response_code .')';
              }
          }
      }
      else {
          $errormsg = "Unable to retrieve RSS file for unknown reasons.";
      }

      // fetch failed: fall back to the cached object unless the caller
      // asked for fresh data only
      if ( $stale and !MAGPIE_CACHE_FRESH_ONLY ) {
          if ( MAGPIE_DEBUG ) {
              debug("Returning STALE object for $url");
          }
          return $stale;
      }

      // else we totally failed
      error( $errormsg );
      return false;
  } // end fetch_rss()
  /*=======================================================================*\
      Function:   error
      Purpose:    record a message in the global MAGPIE_ERROR and raise it
                  through trigger_error()
      Input:      the error message
                  error level to trigger (optional, E_USER_WARNING)
      Output:     none; an empty message is ignored
  \*=======================================================================*/
  function error ($errormsg, $lvl=E_USER_WARNING) {
      global $MAGPIE_ERROR;

      // tack on PHP's own error message when one is visible in this scope
      // (the track_errors mechanism)
      if ( isset($php_errormsg) ) {
          $errormsg = $errormsg . " ($php_errormsg)";
      }

      // nothing to report
      if ( !$errormsg ) {
          return;
      }

      $MAGPIE_ERROR = "MagpieRSS: $errormsg";
      trigger_error( $MAGPIE_ERROR, $lvl );
  }
  /*=======================================================================*\
      Function:   debug
      Purpose:    raise a debugging message through trigger_error()
      Input:      the debug message
                  error level to trigger (optional, E_USER_NOTICE)
  \*=======================================================================*/
  function debug ($debugmsg, $lvl=E_USER_NOTICE) {
      $tagged = "MagpieRSS [debug] $debugmsg";
      trigger_error( $tagged, $lvl );
  }
  /*=======================================================================*\
      Function:   magpie_error
      Purpose:    accessor for the magpie error variable
      Input:      new error message (optional); an empty value leaves the
                  stored message untouched
      Output:     the current value of the global MAGPIE_ERROR
  \*=======================================================================*/
  function magpie_error ($errormsg="") {
      global $MAGPIE_ERROR;

      // only a non-empty message overwrites what is stored
      if ( !empty($errormsg) ) {
          $MAGPIE_ERROR = $errormsg;
      }

      return $MAGPIE_ERROR;
  }
  /*=======================================================================*\
      Function:   _fetch_remote_file
      Purpose:    retrieve an arbitrary remote file
      Input:      url of the remote file
                  headers to send along with the request (optional; only
                  used when given as an array)
      Output:     the Snoopy client object that performed the request
                  (see Snoopy.class.inc)
  \*=======================================================================*/
  function _fetch_remote_file ($url, $headers = "" ) {
      // Snoopy is an HTTP client in PHP
      $snoopy = new Snoopy();

      // configure the client from the MAGPIE_* constants
      $snoopy->use_gzip     = MAGPIE_USE_GZIP;
      $snoopy->read_timeout = MAGPIE_FETCH_TIME_OUT;
      $snoopy->agent        = MAGPIE_USER_AGENT;

      // anything other than an array of headers is ignored
      if ( is_array($headers) ) {
          $snoopy->rawheaders = $headers;
      }

      // errors raised by the fetch are suppressed; callers inspect the
      // returned client instead
      @$snoopy->fetch($url);

      return $snoopy;
  }
  /*=======================================================================*\
      Function:   _response_to_rss
      Purpose:    parse an HTTP response object into an RSS object
      Input:      an HTTP response object (see Snoopy); its ->results and
                  ->headers members are read
      Output:     parsed RSS object (see rss_parse) with ->etag and
                  ->last_modified set when the response carried those
                  headers; false (after calling error()) if parsing failed
  \*=======================================================================*/
  function _response_to_rss ($resp) {
      $rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, MAGPIE_INPUT_ENCODING, MAGPIE_DETECT_ENCODING );

      // parse failed: build the error message and bail out
      if ( !$rss or $rss->ERROR ) {
          $errormsg = "Failed to parse RSS file.";
          if ( $rss ) {
              $errormsg .= " (" . $rss->ERROR . ")";
          }
          error($errormsg);
          return false;
      }

      // find ETag and Last-Modified
      foreach ( $resp->headers as $h ) {
          // Split on the first colon only: values such as dates contain
          // colons themselves. Lines without one (e.g. the status line)
          // carry no field and are skipped.
          $parts = explode(':', $h, 2);
          if ( count($parts) < 2 ) {
              continue;
          }

          // Header field names are case-insensitive, and surrounding
          // whitespace / line endings are not part of the value - they
          // must not leak into the validators sent back on the next
          // conditional request.
          $field = strtolower(trim($parts[0]));
          $val   = trim($parts[1]);

          if ( $field == 'etag' ) {
              $rss->etag = $val;
          }
          elseif ( $field == 'last-modified' ) {
              $rss->last_modified = $val;
          }
      }

      return $rss;
  }
  /*=======================================================================*\
      Function:   init
      Purpose:    setup constants with default values,
                  leaving any the user has already defined untouched
      Input:      none
      Output:     none; only the first call does any work
  \*=======================================================================*/
  function init () {
      // the (misspelt) marker name is kept as-is: it is part of what
      // other code can observe
      if ( defined('MAGPIE_INITALIZED') ) {
          return;
      }
      define('MAGPIE_INITALIZED', true);

      // constant name => default used when the script did not define it
      $defaults = array(
          'MAGPIE_CACHE_ON'         => true,
          'MAGPIE_CACHE_DIR'        => './cache',
          'MAGPIE_CACHE_AGE'        => 60*60,   // one hour
          'MAGPIE_CACHE_FRESH_ONLY' => false,
          'MAGPIE_OUTPUT_ENCODING'  => 'UTF-8',
          'MAGPIE_INPUT_ENCODING'   => null,
          'MAGPIE_DETECT_ENCODING'  => true,
          'MAGPIE_DEBUG'            => 0,
          'MAGPIE_USER_AGENT'       => null,    // computed in the loop
          'MAGPIE_FETCH_TIME_OUT'   => 5,       // 5 second timeout
          // use gzip encoding to fetch rss files if supported?
          'MAGPIE_USE_GZIP'         => true,
      );

      foreach ( $defaults as $name => $value ) {
          if ( defined($name) ) {
              continue;   // user override wins
          }

          if ( $name === 'MAGPIE_USER_AGENT' ) {
              // the agent string advertises whether caching is in use;
              // MAGPIE_CACHE_ON is always defined by this point
              $suffix = MAGPIE_CACHE_ON ? ')' : '; No cache)';
              $value  = 'MagpieRSS/'. MAGPIE_VERSION . ' (+http://magpierss.sf.net' . $suffix;
          }

          define($name, $value);
      }
  }
  335. // NOTE: the following code should really be in Snoopy, or at least
  336. // somewhere other than rss_fetch!
  337. /*=======================================================================*\
  338. HTTP STATUS CODE PREDICATES
  339. These functions attempt to classify an HTTP status code
  340. based on RFC 2616 and RFC 2518.
  341. All of them take an HTTP status code as input, and return true or false
  342. All this code is adapted from LWP's HTTP::Status.
  343. \*=======================================================================*/
  /*=======================================================================*\
      Function:   is_info
      Purpose:    return true if Informational status code (1xx)
      Input:      an HTTP status code
      Output:     true or false
  \*=======================================================================*/
  function is_info ($sc) {
      return (100 <= $sc) && (200 > $sc);
  }
  /*=======================================================================*\
      Function:   is_success
      Purpose:    return true if Successful status code (2xx)
      Input:      an HTTP status code
      Output:     true or false
  \*=======================================================================*/
  function is_success ($sc) {
      return (200 <= $sc) && (300 > $sc);
  }
  /*=======================================================================*\
      Function:   is_redirect
      Purpose:    return true if Redirection status code (3xx)
      Input:      an HTTP status code
      Output:     true or false
  \*=======================================================================*/
  function is_redirect ($sc) {
      return (300 <= $sc) && (400 > $sc);
  }
  /*=======================================================================*\
      Function:   is_error
      Purpose:    return true if Error status code (4xx or 5xx)
      Input:      an HTTP status code
      Output:     true or false
  \*=======================================================================*/
  function is_error ($sc) {
      return (400 <= $sc) && (600 > $sc);
  }
  /*=======================================================================*\
      Function:   is_client_error
      Purpose:    return true if Error status code, and it's a client
                  error (4xx)
      Input:      an HTTP status code
      Output:     true or false
  \*=======================================================================*/
  function is_client_error ($sc) {
      return (400 <= $sc) && (500 > $sc);
  }
  /*=======================================================================*\
      Function:   is_server_error
      Purpose:    return true if Error status code, and it's a server
                  error (5xx)
      Input:      an HTTP status code
      Output:     true or false
  \*=======================================================================*/
  function is_server_error ($sc) {
      return (500 <= $sc) && (600 > $sc);
  }