from RDF import *
from urllib2 import urlopen
from urlparse import urljoin
from sgmllib import SGMLParser
import sys

class LinkParser(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.href = []
        
    def do_link(self, attrs):
        if not ('rel', 'meta') in attrs: return
        if not ('type', 'application/rdf+xml') in attrs: return
        hreflist = [e[1] for e in attrs if e[0]=='href']
        if hreflist:
            self.href.append(hreflist[0])
    
    def end_head(self, attrs):
        self.setnomoretags()

    start_body = end_head

def getRDFLinkFromHTMLSource(htmlSource):
    """Return the RDF metadata link hrefs found in an HTML document.

    htmlSource is the page markup as a string.  The result is the list of
    href values collected by LinkParser; they are returned as written in
    the page, not resolved against any base URL.  Extraction is
    best-effort: if parsing fails, the problem is reported on stderr and an
    empty list is returned.
    """
    try:
        parser = LinkParser()
        parser.feed(htmlSource)
    except Exception as e:
        # Malformed markup must not abort the caller, but KeyboardInterrupt
        # and SystemExit (not Exception subclasses) still propagate.
        sys.stderr.write("Could not extract RDF links: %r\n" % (e,))
        return []
    return parser.href

# m accumulates the blogroll graph; p is the RDF parser reused for every fetch.
m = Model()
p = Parser()
sys.stderr.write("Loading blogroll..."+"\n")
p.parse_into_model(m,"http://journal.dajobe.org/journal/2003/07/semblogs/bloggers.rdf")

# (subject node, absolute RDF URL) pairs discovered while crawling.
found_rdf = []

# Follow every rdfs:seeAlso in the blogroll, look for RSS 1.0 channels in the
# referenced document, and scan each channel's HTML page for RDF <link>s.
for t in m.find_statements(
        Statement(None,Uri("http://www.w3.org/2000/01/rdf-schema#seeAlso"),None)):
    if not t.object.is_resource():
        # Only resource objects carry a URI to fetch; same guard as the one
        # applied to channel subjects below.
        continue
    u = str(t.object.uri)
    r = Model()
    sys.stderr.write("Grabbing "+u+"\n")
    p.parse_into_model(r,u)
    for t2 in r.find_statements(
            Statement(None,
                Uri("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
                Uri("http://purl.org/rss/1.0/channel"))):
        if not t2.subject.is_resource():
            continue
        url = str(t2.subject.uri)
        sys.stderr.write("Getting HTML from "+url+"\n")
        urldata = urlopen(url)
        try:
            content = urldata.read()
        finally:
            # Release the connection even if the read fails.
            urldata.close()
        for rdf in getRDFLinkFromHTMLSource(content):
            # hrefs may be relative; resolve them against the page URL.
            rdf = urljoin(url,rdf)
            found_rdf.append((t.subject,rdf))
            sys.stderr.write("Found RDF: "+rdf+"\n")

# g collects the contents of every RDF document found during the crawl.
# For each one, the blogroll graph m also gains an rdfs:seeAlso statement
# linking the originating subject to that document.
g = Model()
for subject, rdf_url in found_rdf:
    sys.stderr.write("Grabbing RDF: " + rdf_url + "\n")
    p.parse_into_model(g, rdf_url)
    see_also = Statement(
        subject,
        Uri("http://www.w3.org/2000/01/rdf-schema#seeAlso"),
        Uri(rdf_url))
    m.add_statement(see_also)

# Write the gathered RDF (g) and the augmented blogroll (m) to disk.
# The progress messages name the files that are actually written.
sys.stderr.write("Serializing to gathered.rdf...\n")
s = Serializer()
s.serialize_model_to_file("gathered.rdf",g)

sys.stderr.write("Serializing to augmented-blogroll.rdf...\n")
s.serialize_model_to_file("augmented-blogroll.rdf",m)
