#!/usr/bin/perl
# This filter turns heise's atom feed into a proper rss feed with *content*
# and is largely based on heise3rss by Lars Lindner, Balazs Melikant, and
# Mathis Dirksen-Thedens.
#
# Importantly, it is now based on the list of news obtained from
# http://www.heise.de/newsticker/heise-atom.xml
#
# The success of the parsing operation obviously depends on heise sticking
# to their HTML markup. It is thus a hack (see extract_article_body below),
# trying to find the place where the actual article resides and dropping any
# ads in between. And, it may break anytime heise chooses to alter their
# markup.

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

# Please refer to http://www.gnu.org/licenses/gpl.txt for the
# GNU General Public License that applies to this program.

use strict;
use warnings;

# Site root, used as channel link and as link of the "curl missing" item.
use constant NEWSTICKER_URL   => 'http://www.heise.de/newsticker/';
# Prefix every article number is appended to, both for fetching and linking.
use constant ARTICLE_BASE_URL => 'http://www.heise.de/newsticker/meldung/';
# Name of the file (inside ~/.liferea) remembering the newest article seen.
use constant STATE_FILE_NAME  => '.heise3rss-last-nr';

# Directory holding the state file: ~/.liferea.
# Returns nothing when no home directory can be determined.
sub state_dir {
    my $home = $ENV{HOME};
    if (!defined $home || $home eq '') {
        $home = (getpwuid($<))[7];
    }
    return unless defined $home && $home ne '';
    return "$home/.liferea";
}

# Read the number of the newest article delivered by a previous run.
# Returns 0 when the state file is missing, unreadable or not a number.
sub read_last_article_number {
    my $dir = state_dir();
    return 0 unless defined $dir;

    open(my $fh, '<', "$dir/" . STATE_FILE_NAME) or return 0;
    my $first_line = <$fh>;
    close $fh;

    return 0 unless defined $first_line;
    return $first_line =~ /^(\d+)$/ ? $1 + 0 : 0;
}

# Remember $last_nr so the next run starts off from there.
# Best effort: a failure is reported on STDERR but never aborts the filter,
# because the feed has already been written to STDOUT at that point.
sub write_last_article_number {
    my ($last_nr) = @_;

    my $dir = state_dir();
    if (!defined $dir) {
        warn "heise3rss: cannot determine home directory, state not saved\n";
        return;
    }
    if (!-d $dir && !mkdir $dir) {
        warn "heise3rss: cannot create $dir: $!\n";
        return;
    }

    my $file = "$dir/" . STATE_FILE_NAME;
    my $fh;
    if (!open($fh, '>', $file)) {
        warn "heise3rss: cannot write $file: $!\n";
        return;
    }
    print {$fh} "$last_nr\n";
    close $fh or warn "heise3rss: cannot close $file: $!\n";
    return;
}

# Collect the articles announced in the atom feed.
#   @_      : the lines (or chunks) of the atom document
#   returns : a flat (article number => title) list, suitable for a hash
# Only <entry> elements are inspected, so the feed's own <title>/<id> are
# ignored.  An entry is used only if its <id> ends in "meldung/<digits>":
# the number is later compared numerically and becomes part of a URL, so
# anything else is dropped here.
sub parse_feed_entries {
    my $feed = join '', @_;

    my %title_of;
    while ($feed =~ m{<entry\b[^>]*>(.*?)</entry>}sg) {
        my $entry = $1;

        my ($id) = $entry =~ m{<id>[^<]*?meldung/(\d+)\s*</id>};
        next unless defined $id;

        my ($title) = $entry =~ m{<title\b[^>]*>(.*?)</title>}s;
        $title = '' unless defined $title;
        $title =~ s/\s+/ /g;        # titles may be wrapped over several lines
        $title =~ s/\A | \z//g;

        $title_of{$id} = $title;
    }
    return %title_of;
}

# Pick the article numbers that are newer than $last_nr.
#   returns : those numbers in ascending *numeric* order, so that raising
#             the high-water mark while walking the list never hides an
#             older, still undelivered article (a plain string sort would
#             put "10" before "9").
sub new_article_ids {
    my ($last_nr, @ids) = @_;
    return sort { $a <=> $b } grep { $_ > $last_nr } @ids;
}

# Tell whether the curl command line tool can be run.
# The decision is based on the output of "curl --version" rather than on an
# exit status, because the status of informational options differs between
# curl releases.
sub curl_available {
    my $version = `curl --version 2>/dev/null`;
    return (defined $version && $version =~ /\Acurl\b/) ? 1 : 0;
}

# Download the page of article $id and return its lines.
# Returns an empty list if $id is not a plain number or curl cannot be run.
# curl is started without a shell (list form of open), so nothing taken from
# the feed is ever interpreted as shell syntax.
sub fetch_article_lines {
    my ($id) = @_;
    return unless defined $id && $id =~ /\A\d+\z/;

    my $curl;
    if (!open($curl, '-|', 'curl', '-s', '-L', ARTICLE_BASE_URL . $id)) {
        warn "heise3rss: cannot run curl: $!\n";
        return;
    }
    my @lines = <$curl>;
    close $curl;
    return @lines;
}

# Cut the article text out of a downloaded page.
#   @_      : the lines of the HTML page
#   returns : the concatenated lines considered part of the article
# The article lives between a line containing HEISETEXT and a later line
# containing /HEISETEXT.  Inside that region copying is switched off by
# lines that look like ads or scripts and switched on again by the next
# line that looks like article markup.
sub extract_article_body {
    my @lines = @_;

    my $in_article = 0;     # the opening HEISETEXT marker has been seen
    my $copying    = 0;     # lines are currently appended to the result
    my $body       = '';

    for my $line (@lines) {
        # start of relevant block; the marker line itself is not copied
        if (!$in_article && $line =~ /HEISETEXT/) {
            $in_article = 1;
            $copying    = 1;
            next;
        }

        # start (resume) copying on paragraph, heading or author markup
        if ($in_article
            && ($line =~ /<p>/ || $line =~ /h2>/ || $line =~ /href="mailto/))
        {
            $copying = 1;
        }

        # skip ads and scripts, and stay silent until the next resume line
        if (   $line =~ /cadv/
            || $line =~ /<script/
            || $line =~ /<div class="ISI_IGNORE">/)
        {
            $copying = 0;
            next;
        }

        # end of relevant block
        last if $in_article && $line =~ m{/HEISETEXT};

        $body .= $line if $copying;
    }
    return $body;
}

# Wrap $text in a CDATA section.  A "]]>" inside the text would close the
# section early, so it is split across two adjacent sections.
sub wrap_cdata {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/\]\]>/]]]]><![CDATA[>/g;
    return "<![CDATA[$text]]>";
}

# Build the complete RSS document.
#   @_      : hash references with the keys
#               title   - item title, already valid XML text
#               link    - item URL
#               content - item body, already valid XML (e.g. from wrap_cdata)
#   returns : the document as one string
sub render_feed {
    my @items = @_;

    my $xml =
          "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
        . "<rdf:RDF\n"
        . "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n"
        . "xmlns:content=\"http://purl.org/rss/1.0/modules/content/\"\n"
        . "xmlns=\"http://my.netscape.com/rdf/simple/0.9/\">\n"
        . "<channel>\n"
        . "	<title>heise online news</title>\n"
        . "	<link>" . NEWSTICKER_URL . "</link>\n"
        . "	<description>Nachrichten aus der Welt des Computers</description>\n"
        . "</channel>\n";

    for my $item (@items) {
        $xml .= "<item>\n<title>$item->{title}</title>\n"
            . "<link>$item->{link}</link>\n"
            . "<description>Sorry but your reader must support the content namespace!</description>\n"
            . "<content:encoded>$item->{content}</content:encoded>\n</item>\n";
    }

    $xml .= "</rdf:RDF>\n";
    return $xml;
}

# Filter entry point: atom feed on STDIN (or the files named in @ARGV),
# RSS feed with full article texts on STDOUT.
sub main {
    my @feed_lines = <>;
    my $last_nr    = read_last_article_number();
    my %title_of   = parse_feed_entries(@feed_lines);

    my @items;
    if (curl_available()) {
        # process collected links and get article texts
        for my $id (new_article_ids($last_nr, keys %title_of)) {
            $last_nr = $id;
            push @items, {
                title   => $title_of{$id},
                link    => ARTICLE_BASE_URL . $id,
                content =>
                    wrap_cdata(extract_article_body(fetch_article_lines($id))),
            };
        }
    }
    else {
        push @items, {
            title   => "Filter script failed! curl not found!",
            link    => NEWSTICKER_URL,
            content => "This filter script uses curl to download websites. "
                . "And it seems like curl is not installed!\n",
        };
    }

    # print rss feed
    print render_feed(@items);

    # write last article number, so that we can start off from there next
    # time around
    write_last_article_number($last_nr);
    return;
}

# Run as a filter when executed directly; stay quiet when loaded by a test.
main() unless caller;

1;