from RDF import *
from sets import Set
from robotparser import RobotFileParser
from twisted.web import client
from twisted.internet import reactor

import sha,urlparse, random, re

class Scutter(object):
    def __init__(self,plan,observers,emptyqueue=None):
        self.plan = plan
        self.observers = observers
        if emptyqueue is not None:
            self.emptyqueue = emptyqueue

    def scut(self):
        next = self.plan.next()
        if next is not None:
            print "Queuing",next
            deferred = client.getPage(next)
            deferred.addCallbacks(callback = PageGot(next,self.observers).handle)
            reactor.callLater(5,self.scut)
        else:
            print "Queue is empty"
            reactor.callLater(1,self.emptyqueue)
            reactor.callLater(5,self.scut)

    def error(self,value):
        print "Error! - ",
        print value

class Observers(object):
    """Ordered chain of observers that a model is passed through."""

    def __init__(self):
        self.observers = list()

    def add(self,observer):
        """Register observer at the end of the chain."""
        self.observers.append(observer)

    def distribute(self,origin,model):
        """Hand model to each observer in registration order.

        Every observer's observe(origin, model) return value becomes the
        model given to the next observer.  Nothing is returned.
        """
        handed_on = model
        for listener in self.observers:
            handed_on = listener.observe(origin,handed_on)

class PageGot(object):
    def __init__(self,uri,observers):
        self.uri = uri
        self.observers = observers
        self.parser = Parser()
    def handle(self,value):
        newmodel = Model()
        print "Got from", self.uri
        try:
            self.parser.parse_string_into_model(newmodel,value,Uri(self.uri))
            self.observers.distribute(self.uri,newmodel)
        except RuntimeWarning:
            print "Couldn't parse",uri

class JosekiPoster(object):
    """Observer that POSTs every model it sees to a Joseki endpoint.

    Blank nodes are replaced by resources in the
    http://hackdiary.com/ns/bnodes# namespace before serialising, and the
    serialised text is POSTed to <url>?add with content type
    application/n-triples.
    """

    # Number of observe() calls made, shared by all instances.
    counter = 0

    def __init__(self,url):
        self.joseki_url = url

    def observe(self,origin,incoming):
        """Serialise `incoming` and POST it; returns `incoming` unchanged."""
        # Local imports: only this method needs them.
        import os
        import tempfile

        bnodes = NS("http://hackdiary.com/ns/bnodes#")
        JosekiPoster.counter += 1

        altered_model = Model()
        for s in incoming:
            if s.subject.is_blank():
                subject = bnodes[s.subject.blank_identifier]
            else:
                subject = s.subject
            if s.object.is_blank():
                obj = bnodes[s.object.blank_identifier]
            else:
                obj = s.object

            altered_model.append(Statement(subject,s.predicate,obj))

        serializer = Serializer(mime_type="text/plain")

        # Serialise through a private temporary file rather than a fixed,
        # predictable /tmp path, and always close and remove it afterwards.
        handle, path = tempfile.mkstemp(suffix=".nt")
        os.close(handle)
        try:
            serializer.serialize_model_to_file(path,altered_model)
            nt_file = open(path)
            try:
                ntriples = nt_file.read()
            finally:
                nt_file.close()
        finally:
            os.remove(path)

        def done(value):
            print("POSTed to Joseki")

        def failed(failure):
            # Report the failure here instead of leaving the Deferred with an
            # unhandled error.
            print("POST to Joseki failed: %s" % (failure.getErrorMessage(),))

        headers = { "Content-type": "application/n-triples" }
        deferred = client.getPage(self.joseki_url+"?add",
                method="POST",
                postdata=ntriples,
                headers=headers)
        deferred.addCallbacks(done,failed)

        return incoming

    def _escape_literal(self,text):
        """Escape `text` for use inside a double-quoted N-Triples string."""
        # Backslash first, so the escapes added below are not doubled.
        text = text.replace("\\","\\\\")
        text = text.replace('"','\\"')
        text = text.replace("\n","\\n")
        text = text.replace("\r","\\r")
        text = text.replace("\t","\\t")
        return text

    def as_ntriple(self,statement):
        """Return `statement` formatted as a single N-Triples line.

        Blank nodes become _:id, resources become <uri>, and literals become
        a quoted, escaped string built from str() of the object node.
        """
        if statement.subject.is_blank():
            subject = "_:"+statement.subject.blank_identifier
        else:
            subject = "<"+str(statement.subject.uri)+">"

        predicate = "<"+str(statement.predicate.uri)+">"

        if statement.object.is_blank():
            obj = "_:"+statement.object.blank_identifier
        elif statement.object.is_literal():
            obj = '"'+self._escape_literal(str(statement.object))+'"'
        else:
            obj = "<"+str(statement.object.uri)+">"

        return subject+" "+predicate+" "+obj+" ."

class MboxSha1Normalise(object):
    def observe(self,origin,incoming):
        for s in incoming:
            if str(s.predicate.uri) == 'http://xmlns.com/foaf/0.1/mbox':
                if s.object.is_resource():
                    sha1val=sha.new(str(s.object.uri))
                    s2=Statement(s.subject,
                            Uri("http://xmlns.com/foaf/0.1/mbox_sha1sum"),
                            Node(literal=sha1val.hexdigest()))
                    print s,"to",s2
                    incoming.append(s2)

        return incoming

class Smusher(object):
    def __init__(self):
        self.canonical = {}
        self.owl = NS("http://www.w3.org/2002/07/owl#")
        self.scutterns = NS("http://hackdiary.com/scutter/")
        self.ifps = ["http://xmlns.com/foaf/0.1/mbox_sha1sum",
            "http://xmlns.com/foaf/0.1/mbox",
            "http://xmlns.com/foaf/0.1/weblog",
            "http://xmlns.com/foaf/0.1/homepage",
        ]

    def derive_canons(self,incoming,origin):
        canonical = {}
        rewrites_made = {}
        for s in incoming:
            if str(s.predicate.uri) in self.ifps:
                # blank literals are a source of accidental merging in IFPs
                if s.object.is_literal() and str(s.object) == '':
                    print "Found an IFP with a blank literal at",origin,"in",str(s)
                else:
                    pred_obj = (str(s.predicate),str(s.object))
                    if pred_obj in self.canonical:
                        canonical[str(s.subject)] = self.canonical[pred_obj]

                        if str(s.subject) not in rewrites_made:
                            rewrites_made[str(s.subject)] = []
                        rewrites_made[str(s.subject)].append(self.canonical[pred_obj])

                        print "Mapping",str(s.subject),"to",self.canonical[pred_obj]

                        self.equate(incoming,s.subject,self.canonical[pred_obj])
                    else:
                        self.canonical[pred_obj] = s.subject
                        print s.subject,"is made canonical for ifp",s.predicate,"with value",s.object

        multi_mapped_subjects = [s for s in rewrites_made if len(rewrites_made[s]) > 1]
        if len(multi_mapped_subjects)>0:
            for s in multi_mapped_subjects:
                for t in range(len(rewrites_made[s])-1):
                    self.equate(incoming,rewrites_made[s][t],rewrites_made[s][t+1])

        return canonical

    def equate(self,model,uri1,uri2):
        if not uri1 == uri2:
            model.append(Statement(uri1,
                self.scutterns.mergeWith,
                uri2))
            print(Statement(uri1,
                self.scutterns.mergeWith,
                uri2))

    def observe(self,origin,incoming):
        local_canonical = self.derive_canons(incoming,origin)

        return incoming

class MergeSameAs(object):
    """Observer that rewrites nodes according to scutter:mergeWith statements."""

    def __init__(self):
        self.owl = NS("http://www.w3.org/2002/07/owl#")
        self.scutterns = NS("http://hackdiary.com/scutter/")

    def canonicalise(self,statement,canonical):
        """Return `statement` with subject and object mapped through `canonical`.

        Statements whose predicate is scutter:mergeWith or owl:sameAs are
        returned as they are.  For any other statement a new Statement is
        built in which the subject and the object are each replaced by their
        entry in `canonical` when they have one.
        """
        predicate = statement.predicate
        if predicate == self.scutterns.mergeWith or predicate == self.owl.sameAs:
            return statement

        new_subject = canonical.get(statement.subject,statement.subject)
        new_object = canonical.get(statement.object,statement.object)

        return Statement(new_subject,statement.predicate,new_object)

    def observe(self,origin,incoming):
        """Build and return a new Model holding the canonicalised statements.

        The rewrite table maps the subject of every scutter:mergeWith
        statement in `incoming` to that statement's object.
        """
        pattern = Statement(None,self.scutterns.mergeWith,None)
        rewrites = {}
        for merge in incoming.find_statements(pattern):
            rewrites[merge.subject] = merge.object

        merged = Model()
        for s in incoming:
            merged.add_statement(self.canonicalise(s,rewrites))

        return merged

class Aggregator(object):
    def __init__(self):
        storage=Storage(storage_name="hashes",
            name='aggregate',
            options_string="hash-type='bdb',contexts='yes',index-subjects='yes',index-objects='yes',index-predicates='yes',dir='.'")
        self.model = Model(storage)
        self.count = 0
        for s in self.model:
            self.count += 1

    def observe(self,origin,incoming):
        origin_node = Node(uri_string=origin)

        remove_count = 0
        for s in self.model.as_stream(context=origin_node):
            remove_count += 1

        if remove_count > 0:
            self.count -= remove_count
            print "Removing",remove_count,"statements from",origin
            self.model.remove_statements_with_context(origin_node)

        for s in incoming:
            self.model.add_statement(s,context=origin_node)
            self.count += 1
        print "Count:",self.count
        return incoming

    def dump(self):
        print "Dumping..."
        s = Serializer()
        s.serialize_model_to_file("dump.rdf",self.model)
        #reactor.stop()
        #reactor.callLater(3600,agg.dump,"dump.rdf")

class Scutterplan(object):
    def __init__(self,initial_list):
        self.robots = {}
        self.rdfs = NS("http://www.w3.org/2000/01/rdf-schema#")
        self.list = initial_list
        self.seen = Set(self.list)

    def next(self):
        if len(self.list) == 0:
            return None
        else:
            return self.list.pop(random.randint(0,len(self.list)-1))

    def observe(self,origin,incoming):
        for statement in incoming:
            if statement.predicate == self.rdfs.seeAlso:
                self.add_to_plan(origin,statement.object)
        return incoming

    def blacklisted(self,uri):
        blacklist = [
            #"http://www.sentient.co.uk/foaf/",
            #"http://www.ecademy.com",
            #"http://triplestore.aktors.org"
            ]
        for prefix in blacklist:
            if uri.startswith(prefix):
                self.seen.add(uri)
                return True
        return False

    def banned(self,uri):
        host = urlparse.urlsplit(uri)[1]
        robots = "http://"+host+"/robots.txt"

        if robots not in self.robots:
            self.robots[robots] = RobotFileParser()
            self.robots[robots].set_url(robots)
            self.robots[robots].read()

        if self.robots[robots].can_fetch("*",uri):
            return False
        else:
            return True

    def add_to_plan(self,origin,uri_node):
        if uri_node.is_resource():
            uri = str(uri_node.uri)
            if self.have_seen(uri):
                print "Already seen:",uri
                return
            if self.blacklisted(uri):
                print "Blacklisted:",uri
                return
            if self.banned(uri):
                print "Banned:",uri
                return

            self.list.append(uri)
            print "From",origin,"to",uri

    def have_seen(self,uri):
        if uri in self.seen:
            return True
        else:
            self.seen.add(uri)
            return False

def main():
    """Crawl starting from the URIs given on the command line.

    Wires the plan, the normalising/merging observers and the aggregator
    together, schedules the first scut() and runs the reactor.
    """
    # The classes are used straight from this module.  Importing them again
    # from a module named `scutter` loaded a second copy of the file and
    # failed whenever the file was not importable under that name.
    import sys

    plan = Scutterplan(sys.argv[1:])

    agg = Aggregator()

    observers = Observers()
    observers.add(plan)
    observers.add(MboxSha1Normalise())
    observers.add(Smusher())
    observers.add(MergeSameAs())
    #observers.add(JosekiPoster("http://localhost:2020/db"))
    #from simpledb import SimpleDB
    #observers.add(SimpleDB(host="localhost",user="rdf",passwd="rdf",db="rdf"))
    observers.add(agg)
    # agg.dump is run whenever the plan has nothing left to fetch.
    scutter = Scutter(plan,observers,agg.dump)

    reactor.callLater(0.2,scutter.scut)

    reactor.run()

# Start crawling only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
