magpierss.class.php 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603
  1. <?php
  2. /**
  3. * Project: MagpieRSS: a simple RSS integration tool
  4. * File: rss_parse.inc - parse an RSS or Atom feed
  5. * return as a simple object.
  6. *
  7. * Handles RSS 0.9x, RSS 2.0, RSS 1.0, and Atom 0.3
  8. *
  9. * The latest version of MagpieRSS can be obtained from:
  10. * http://magpierss.sourceforge.net
  11. *
  12. * For questions, help, comments, discussion, etc., please join the
  13. * Magpie mailing list:
  14. * magpierss-general@lists.sourceforge.net
  15. *
  16. * @author Kellan Elliott-McCrea <kellan@protest.net>
  17. * @version 0.7a
  18. * @license GPL
  19. *
  20. */
  // Feed-family identifiers; MagpieRSS stores one of them in $feed_type.
  // Guarded so that loading this file after the constants were already
  // defined elsewhere does not raise a "constant already defined" error.
  if (!defined('RSS')) {
      define('RSS', 'RSS');
  }
  if (!defined('ATOM')) {
      define('ATOM', 'Atom');
  }
  23. require_once MAGPIE_DIR . 'rss_utils.inc';
  /**
   * Hybrid parser and result object: takes an RSS or Atom document as a string,
   * parses it inside the constructor and exposes the result as plain arrays
   * ($channel, $items, $image, $textinput).
   *
   * Element names are lower-cased while parsing. Text of namespaced elements is
   * stored one level deeper, keyed by the namespace prefix, e.g.
   * $item['dc']['date']. Nested elements are stored under their names joined
   * with '_' (outermost first).
   *
   * Problems are reported through error(); the most recent messages are kept
   * in $ERROR and $WARNING.
   */
  class MagpieRSS
  {
      public $parser;                  // XML parser handle while parsing; null once parsing is done

      public $current_item = array();  // item currently being parsed
      public $items = array();         // collection of parsed items
      public $channel = array();       // hash of channel fields
      public $textinput = array();
      public $image = array();
      public $feed_type;               // RSS or ATOM, set from the root element
      public $feed_version;
      public $encoding = '';           // output encoding of parsed rss
      public $_source_encoding = '';   // only set if we have to parse xml prolog

      public $ERROR = "";
      public $WARNING = "";

      // define some constants
      public $_CONTENT_CONSTRUCTS = array('content', 'summary', 'info', 'title', 'tagline', 'copyright');
      public $_KNOWN_ENCODINGS = array('UTF-8', 'US-ASCII', 'ISO-8859-1');

      // parser variables, useless if you're not a parser, treat as private
      public $stack = array();         // parser stack, innermost element first
      public $inchannel = false;
      public $initem = false;
      public $incontent = false;       // false, or the name of the Atom content construct being read
      public $intextinput = false;
      public $inimage = false;
      public $current_namespace = false;

      /**
       * Set up the XML parser, parse $source and populate this object.
       *
       * On a setup failure (XML extension missing, parser could not be
       * created) the error is reported through error() and the constructor
       * returns early, leaving the object empty with $ERROR set.
       *
       * @param string      $source          string containing the RSS/Atom to be parsed
       * @param string      $output_encoding character set the parsed values are delivered in
       * @param string|null $input_encoding  character set of $source; leave empty to let
       *                                     the parser work it out
       * @param bool        $detect_encoding if false, $input_encoding (when given) is
       *                                     passed to the parser as-is
       */
      public function __construct($source, $output_encoding = 'ISO-8859-1', $input_encoding = null, $detect_encoding = true)
      {
          if (!function_exists('xml_parser_create')) {
              $this->error(
                  "Failed to load PHP's XML Extension. " .
                  "http://www.php.net/manual/en/ref.xml.php",
                  E_USER_ERROR
              );
              // nothing below can work without the extension
              return;
          }

          list($parser, $source) = $this->create_parser(
              $source,
              $output_encoding,
              $input_encoding,
              $detect_encoding
          );

          // xml_parser_create() yields a resource up to PHP 7 and an
          // XMLParser object from PHP 8 on; accept both.
          if (!is_resource($parser) && !is_object($parser)) {
              $this->error(
                  "Failed to create an instance of PHP's XML parser. " .
                  "http://www.php.net/manual/en/ref.xml.php",
                  E_USER_ERROR
              );
              return;
          }

          $this->parser = $parser;

          // Register the handlers as object callables; this needs no
          // xml_set_object() call.
          xml_set_element_handler(
              $parser,
              array($this, 'feed_start_element'),
              array($this, 'feed_end_element')
          );
          xml_set_character_data_handler($parser, array($this, 'feed_cdata'));

          if (!xml_parse($parser, $source)) {
              $errorcode = xml_get_error_code($parser);
              if ($errorcode != XML_ERROR_NONE) {
                  $xml_error = xml_error_string($errorcode);
                  $error_line = xml_get_current_line_number($parser);
                  $error_col = xml_get_current_column_number($parser);
                  $errormsg = "$xml_error at line $error_line, column $error_col";
                  $this->error($errormsg);
              }
          }

          // Only resource handles need an explicit free. The handle is of no
          // use after parsing, and an XMLParser object cannot be serialized,
          // so do not keep it on the object.
          if (is_resource($parser)) {
              xml_parser_free($parser);
          }
          $this->parser = null;

          $this->normalize();
      }

      /**
       * XML start-tag handler.
       *
       * The first element seen decides the feed type and version. After that
       * the handler switches the in* state flags, flattens inline markup inside
       * Atom content constructs, records Atom links, or pushes the element
       * name on the parser stack.
       *
       * @param mixed  $p       the XML parser (unused)
       * @param string $element element name as reported by the parser
       * @param array  $attrs   attribute name => value
       */
      public function feed_start_element($p, $element, $attrs)
      {
          $element = strtolower($element);
          $el = $element;
          $attrs = array_change_key_case($attrs, CASE_LOWER);

          // check for a namespace prefix, and split if found
          $ns = false;
          if (strpos($element, ':')) {
              list($ns, $el) = explode(':', $element, 2);
          }
          if ($ns and $ns != 'rdf') {
              $this->current_namespace = $ns;
          }

          // if feed type isn't set, then this is the first element of the
          // feed: identify the feed from it and stop
          if (!isset($this->feed_type)) {
              $version = isset($attrs['version']) ? $attrs['version'] : null;

              if ($el == 'rdf') {
                  $this->feed_type = RSS;
                  $this->feed_version = '1.0';
              } elseif ($el == 'rss') {
                  $this->feed_type = RSS;
                  $this->feed_version = $version;
              } elseif ($el == 'feed') {
                  // Atom 1.0 (RFC 4287) has no version attribute; it is
                  // identified by its namespace URI instead.
                  if ($version === null and in_array('http://www.w3.org/2005/Atom', $attrs, true)) {
                      $version = '1.0';
                  }
                  $this->feed_type = ATOM;
                  $this->feed_version = $version;
                  $this->inchannel = true;
              }
              return;
          }

          $is_rss = ($this->feed_type == RSS);
          $is_atom = ($this->feed_type == ATOM);
          $default_ns = ($this->current_namespace == '');

          if ($el == 'channel') {
              $this->inchannel = true;
          } elseif ($el == 'item' or $el == 'entry') {
              $this->initem = true;
              if (isset($attrs['rdf:about'])) {
                  $this->current_item['about'] = $attrs['rdf:about'];
              }
          } elseif ($is_rss and $default_ns and $el == 'textinput') {
              // textinput in the default namespace of an RSS feed
              $this->intextinput = true;
          } elseif ($is_rss and $default_ns and $el == 'image') {
              // image in the default namespace of an RSS feed
              $this->inimage = true;
          } elseif ($is_atom and in_array($el, $this->_CONTENT_CONSTRUCTS)) {
              // handle atom content constructs; avoid clashing w/ RSS mod_content
              if ($el == 'content') {
                  $el = 'atom_content';
              }
              $this->incontent = $el;
          } elseif ($is_atom and $this->incontent) {
              // inside an Atom content construct tags are treated as text:
              // re-serialise the tag and append it to the content
              $attrs_str = join(
                  ' ',
                  array_map('map_attrs', array_keys($attrs), array_values($attrs))
              );
              $this->append_content("<$element $attrs_str>");
              array_unshift($this->stack, $el);
          } elseif ($is_atom and $el == 'link') {
              // Atom supports many links per containing element. Links with
              // rel='alternate' are treated as equivalent to RSS's simple
              // link element; a missing rel means 'alternate' (RFC 4287).
              $rel = isset($attrs['rel']) ? $attrs['rel'] : 'alternate';
              $href = isset($attrs['href']) ? $attrs['href'] : '';
              $link_el = ($rel == 'alternate') ? 'link' : 'link_' . $rel;
              $this->append($link_el, $href);
          } else {
              // set stack[0] to current element
              array_unshift($this->stack, $el);
          }
      }

      /**
       * XML character-data handler: routes text either into the Atom content
       * construct being read or into the field named by the parser stack.
       *
       * @param mixed  $p    the XML parser (unused)
       * @param string $text character data
       */
      public function feed_cdata($p, $text)
      {
          if ($this->feed_type == ATOM and $this->incontent) {
              $this->append_content($text);
              return;
          }

          // stack is innermost-first; the field name is outermost-first
          $current_el = join('_', array_reverse($this->stack));
          $this->append($current_el, $text);
      }

      /**
       * XML end-tag handler: undoes what feed_start_element() did for the
       * element and resets the current namespace.
       *
       * @param mixed  $p  the XML parser (unused)
       * @param string $el element name as reported by the parser
       */
      public function feed_end_element($p, $el)
      {
          $el = strtolower($el);

          $is_rss = ($this->feed_type == RSS);
          $is_atom = ($this->feed_type == ATOM);
          $default_ns = ($this->current_namespace == '');

          if ($el == 'item' or $el == 'entry') {
              $this->items[] = $this->current_item;
              $this->current_item = array();
              $this->initem = false;
          } elseif ($is_rss and $default_ns and $el == 'textinput') {
              $this->intextinput = false;
          } elseif ($is_rss and $default_ns and $el == 'image') {
              $this->inimage = false;
          } elseif ($is_atom and in_array($el, $this->_CONTENT_CONSTRUCTS)) {
              $this->incontent = false;
          } elseif ($el == 'channel' or $el == 'feed') {
              $this->inchannel = false;
          } elseif ($is_atom and $this->incontent) {
              // balance tags properly
              // note: i don't think this is actually necessary
              if (isset($this->stack[0]) and $this->stack[0] == $el) {
                  $this->append_content("</$el>");
              } else {
                  $this->append_content("<$el />");
              }
              array_shift($this->stack);
          } elseif ($is_atom and $el == 'link') {
              // an Atom link outside a content construct was never pushed
              // on the stack by the start handler, so there is nothing to pop
          } else {
              array_shift($this->stack);
          }

          $this->current_namespace = false;
      }

      /**
       * Append $str2 to $str1 in place, initialising $str1 to "" when unset.
       *
       * @param string|null $str1 target, modified by reference
       * @param string      $str2 text to append
       */
      public function concat(&$str1, $str2 = "")
      {
          if (!isset($str1)) {
              $str1 = "";
          }
          $str1 .= $str2;
      }

      /**
       * Append text to the Atom content construct named by $incontent, on the
       * current item when inside one, otherwise on the channel.
       *
       * @param string $text text or re-serialised markup
       */
      public function append_content($text)
      {
          if ($this->initem) {
              $this->concat($this->current_item[$this->incontent], $text);
          } elseif ($this->inchannel) {
              $this->concat($this->channel[$this->incontent], $text);
          }
      }

      /**
       * Smart append - field and namespace aware.
       *
       * Appends $text to field $el of whichever container is currently open.
       * With a current namespace the field lives under that namespace's
       * sub-array. Does nothing when $el is empty.
       *
       * @param string $el   field name
       * @param string $text text to append
       */
      public function append($el, $text)
      {
          if (!$el) {
              return;
          }

          $ns = $this->current_namespace;

          if ($ns) {
              // precedence for namespaced fields: item, channel, textinput, image
              if ($this->initem) {
                  $this->concat($this->current_item[$ns][$el], $text);
              } elseif ($this->inchannel) {
                  $this->concat($this->channel[$ns][$el], $text);
              } elseif ($this->intextinput) {
                  $this->concat($this->textinput[$ns][$el], $text);
              } elseif ($this->inimage) {
                  $this->concat($this->image[$ns][$el], $text);
              }
              return;
          }

          // precedence for plain fields: item, textinput, image, channel
          if ($this->initem) {
              $this->concat($this->current_item[$el], $text);
          } elseif ($this->intextinput) {
              $this->concat($this->textinput[$el], $text);
          } elseif ($this->inimage) {
              $this->concat($this->image[$el], $text);
          } elseif ($this->inchannel) {
              $this->concat($this->channel[$el], $text);
          }
      }

      /**
       * Cross-populate RSS and Atom field names so callers can use either
       * vocabulary, and add a 'date_timestamp' to items whose date parses.
       *
       * Atom: tagline -> description, summary -> description,
       *       atom_content -> content/encoded, issued|modified -> timestamp.
       * RSS:  description -> tagline, description -> summary,
       *       content/encoded -> atom_content, dc:date (RSS 1.0) or pubdate
       *       -> timestamp.
       *
       * parse_w3cdtf() is not defined in this class; it is expected to be
       * provided by the file's rss_utils.inc include.
       */
      public function normalize()
      {
          if ($this->is_atom()) {
              // if atom populate rss fields
              $this->channel['description'] =
                  isset($this->channel['tagline']) ? $this->channel['tagline'] : null;

              foreach ($this->items as $i => $item) {
                  if (isset($item['summary'])) {
                      $item['description'] = $item['summary'];
                  }
                  if (isset($item['atom_content'])) {
                      $item['content']['encoded'] = $item['atom_content'];
                  }

                  $atom_date = null;
                  if (isset($item['issued'])) {
                      $atom_date = $item['issued'];
                  } elseif (isset($item['modified'])) {
                      $atom_date = $item['modified'];
                  }

                  if ($atom_date) {
                      $epoch = @parse_w3cdtf($atom_date);
                      if ($epoch and $epoch > 0) {
                          $item['date_timestamp'] = $epoch;
                      }
                  }

                  $this->items[$i] = $item;
              }
          } elseif ($this->is_rss()) {
              $this->channel['tagline'] =
                  isset($this->channel['description']) ? $this->channel['description'] : null;

              // the version does not change while looping
              $is_rss_1 = ($this->is_rss() == '1.0');

              foreach ($this->items as $i => $item) {
                  if (isset($item['description'])) {
                      $item['summary'] = $item['description'];
                  }
                  if (isset($item['content']['encoded'])) {
                      $item['atom_content'] = $item['content']['encoded'];
                  }

                  if ($is_rss_1 and isset($item['dc']['date'])) {
                      $epoch = @parse_w3cdtf($item['dc']['date']);
                      if ($epoch and $epoch > 0) {
                          $item['date_timestamp'] = $epoch;
                      }
                  } elseif (isset($item['pubdate'])) {
                      $epoch = @strtotime($item['pubdate']);
                      if ($epoch > 0) {
                          $item['date_timestamp'] = $epoch;
                      }
                  }

                  $this->items[$i] = $item;
              }
          }
      }

      /**
       * @return string|false|null the feed version when the feed is RSS
       *                           (null if the root carried none), else false
       */
      public function is_rss()
      {
          return ($this->feed_type == RSS) ? $this->feed_version : false;
      }

      /**
       * @return string|false|null the feed version when the feed is Atom
       *                           (null if none could be determined), else false
       */
      public function is_atom()
      {
          return ($this->feed_type == ATOM) ? $this->feed_version : false;
      }

      /**
       * Return the XML parser and the (possibly re-encoded) source.
       *
       * @param string      $source  feed document
       * @param string      $out_enc target encoding; stored in $encoding when non-empty
       * @param string|null $in_enc  source encoding, if known
       * @param bool        $detect  whether to detect the source encoding
       * @return array array($parser, $source)
       */
      public function create_parser($source, $out_enc, $in_enc, $detect)
      {
          // Compare the whole version, not just its first character, so
          // PHP 7, 8 and later take the same path as PHP 5.
          if (version_compare(PHP_VERSION, '5.0.0', '>=')) {
              $parser = $this->php5_create_parser($in_enc, $detect);
          } else {
              list($parser, $source) = $this->php4_create_parser($source, $in_enc, $detect);
          }

          if ($out_enc) {
              $this->encoding = $out_enc;
              // leave a failed parser for the caller to report
              if ($parser) {
                  xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $out_enc);
              }
          }

          return array($parser, $source);
      }

      /**
       * Instantiate an XML parser under PHP 5 and later.
       *
       * An empty encoding string lets the parser detect the input encoding
       * itself; an explicit $in_enc is only used when detection is disabled.
       *
       * @param string|null $in_enc source encoding, if known
       * @param bool        $detect whether to detect the source encoding
       * @return mixed the value returned by xml_parser_create()
       */
      public function php5_create_parser($in_enc, $detect)
      {
          $encoding = (!$detect && $in_enc) ? $in_enc : '';
          return xml_parser_create($encoding);
      }

      /**
       * Instantiate an XML parser under PHP 4.
       *
       * create_parser() only routes here on PHP versions below 5. The source
       * encoding is taken from $in_enc or from the XML prolog (default UTF-8).
       * If it is not one of $_KNOWN_ENCODINGS the source is converted to UTF-8
       * with iconv or mbstring; if neither works a notice is reported and a
       * default parser is returned with the source unchanged.
       *
       * The following code is based on SJM's work with FoF
       * @see http://minutillo.com/steve/weblog/2004/6/17/php-xml-and-character-encodings-a-tale-of-sadness-rage-and-data-loss
       *
       * @param string      $source feed document
       * @param string|null $in_enc source encoding, if known
       * @param bool        $detect whether to detect the source encoding
       * @return array array($parser, $source)
       */
      public function php4_create_parser($source, $in_enc, $detect)
      {
          if (!$detect) {
              return array(xml_parser_create($in_enc), $source);
          }

          if (!$in_enc) {
              // the question mark of the prolog opener must be escaped,
              // otherwise it merely makes the leading '<' optional
              if (preg_match('/<\?xml.*encoding=[\'"](.*?)[\'"].*?>/m', $source, $m)) {
                  $in_enc = strtoupper($m[1]);
                  $this->_source_encoding = $in_enc;
              } else {
                  $in_enc = 'UTF-8';
              }
          }

          if ($this->known_encoding($in_enc)) {
              return array(xml_parser_create($in_enc), $source);
          }

          // the detected encoding is not one of the simple encodings PHP knows;
          // attempt to use the iconv extension to cast the XML to a known encoding
          // @see http://php.net/iconv
          if (function_exists('iconv')) {
              $encoded_source = iconv($in_enc, 'UTF-8', $source);
              if ($encoded_source) {
                  return array(xml_parser_create('UTF-8'), $encoded_source);
              }
          }

          // iconv didn't work, try mb_convert_encoding
          // @see http://php.net/mbstring
          if (function_exists('mb_convert_encoding')) {
              $encoded_source = mb_convert_encoding($source, 'UTF-8', $in_enc);
              if ($encoded_source) {
                  return array(xml_parser_create('UTF-8'), $encoded_source);
              }
          }

          $this->error(
              "Feed is in an unsupported character encoding. ($in_enc) " .
              "You may see strange artifacts, and mangled characters.",
              E_USER_NOTICE
          );

          return array(xml_parser_create(), $source);
      }

      /**
       * @param string $enc encoding name, any case
       * @return string|false the upper-cased name when it is listed in
       *                      $_KNOWN_ENCODINGS, otherwise false
       */
      public function known_encoding($enc)
      {
          $enc = strtoupper($enc);
          return in_array($enc, $this->_KNOWN_ENCODINGS, true) ? $enc : false;
      }

      /**
       * Report a problem and remember it on the object.
       *
       * With MAGPIE_DEBUG defined and true the message is raised through
       * trigger_error(); otherwise it goes to error_log(). Notice-level
       * messages are kept in $WARNING, everything else in $ERROR.
       *
       * @param string $errormsg message text
       * @param int    $lvl      PHP error level constant
       */
      public function error($errormsg, $lvl = E_USER_WARNING)
      {
          // an undefined MAGPIE_DEBUG counts as "debugging off"
          if (defined('MAGPIE_DEBUG') && MAGPIE_DEBUG) {
              trigger_error($errormsg, $lvl);
          } else {
              error_log($errormsg, 0);
          }

          $notices = E_USER_NOTICE | E_NOTICE;
          if ($lvl & $notices) {
              $this->WARNING = $errormsg;
          } else {
              $this->ERROR = $errormsg;
          }
      }
  } // end class RSS
  /**
   * Render one attribute as name="value" for re-serialising an inline tag.
   *
   * The XML parser hands over attribute values with entities already decoded,
   * so the markup-significant characters are escaped again here; otherwise a
   * value containing a double quote or an ampersand would corrupt the
   * generated tag. Plain byte replacement is used so the result does not
   * depend on the character set of the value.
   *
   * @param string $k attribute name
   * @param string $v attribute value (decoded)
   * @return string
   */
  function map_attrs($k, $v)
  {
      // '&' must be replaced first so the entities added afterwards survive
      $escaped = str_replace(
          array('&', '"', '<', '>'),
          array('&amp;', '&quot;', '&lt;', '&gt;'),
          $v
      );

      return "$k=\"$escaped\"";
  }
  // Fallback for medieval PHP 4.1.x builds that lack array_change_key_case(),
  // courtesy Ryan Currie, ryan@digibliss.com
  if (!function_exists('array_change_key_case')) {
      if (!defined('CASE_UPPER')) {
          define("CASE_UPPER", 1);
      }
      if (!defined('CASE_LOWER')) {
          define("CASE_LOWER", 0);
      }

      /**
       * Return a copy of $array with every key lower- or upper-cased.
       *
       * @param array $array input array
       * @param int   $case  CASE_LOWER (default) or CASE_UPPER
       * @return array
       */
      function array_change_key_case($array, $case = CASE_LOWER)
      {
          // compare (not assign) the requested case; anything other than
          // CASE_UPPER lower-cases
          $cmd = ($case == CASE_UPPER) ? 'strtoupper' : 'strtolower';

          $output = array();
          foreach ($array as $key => $value) {
              $output[$cmd($key)] = $value;
          }

          return $output;
      }
  }