import os
import subprocess
import sys
import time
from xml.dom import minidom
from xml.sax.saxutils import escape, quoteattr

import RDF
import tidy

# Build a <data> event feed from the discography RDF and print it to stdout.
# Every resource that has a dc:title becomes one <event> carrying its date,
# title, link and an escaped <img> tag pointing at the record cover.

file = 'http://apassant.net/home/2006/07/theclash/data/discography.n3'
base = 'http://en.wikipedia.org/wiki/The_Clash_discography'

DC_TITLE = "http://purl.org/dc/elements/1.1/title"
DC_DATE = "http://purl.org/dc/elements/1.1/date"
RDF_ABOUT = "http://www.w3.org/1999/02/22-rdf-syntax-ns#about"
NO_COVER = "http://upload.wikimedia.org/wikipedia/en/b/b7/Nocover.gif"
TMP_FILE = 'tmpfile'


def _object_of(model, subject, predicate_uri):
    """Return the object string of the last (subject, predicate, *) statement.

    Returns None when the model holds no matching statement, so the caller
    can tell "missing" apart from a value left over from a previous item.
    """
    value = None
    pattern = RDF.Statement(subject, RDF.Node(uri_string=predicate_uri), None)
    for statement in model.find_statements(pattern):
        value = statement.object.__str__()
    return value


def _cover_image(link):
    """Return the URL of the record cover for the page at ``link``.

    Links that start with ``base`` get the placeholder cover. Any other link
    is downloaded, tidied to XHTML, and the ``src`` of its first <img> that
    has one is returned. The placeholder is also used when the download
    fails or the page contains no usable <img>.
    """
    if link.startswith(base):
        return NO_COVER
    # Cannot access wikipedia with urllib, need to wget. The link comes from
    # remote data, so pass it as an argument (after "--") rather than
    # interpolating it into a shell command line.
    if subprocess.call(['wget', '-O', TMP_FILE, '--', link]) != 0:
        return NO_COVER
    with open(TMP_FILE, 'rb') as htmlfile:
        html = htmlfile.read()
    options = dict(output_xhtml=1, add_xml_decl=1)
    xhtml = tidy.parseString(html, **options)
    dom = minidom.parseString(xhtml.__str__())
    for image in dom.getElementsByTagName('img'):
        if image.hasAttribute('src'):
            return image.getAttribute('src')
    return NO_COVER


def _iso_date(date):
    """Convert a date string to YYYY-MM-DD.

    A 4-character value is taken to be a bare year and mapped to January 1st
    of that year; anything else must match "%B %d %Y" (ValueError otherwise).
    """
    if len(date) == 4:
        return "%s-01-01" % date
    return time.strftime("%Y-%m-%d", time.strptime(date, "%B %d %Y"))


m = RDF.Model()
p = RDF.Parser('turtle')
p.parse_into_model(m, file)

events = []
try:
    ## Get each item
    title_pattern = RDF.Statement(None, RDF.Node(uri_string=DC_TITLE), None)
    for s in m.find_statements(title_pattern):
        uri = s.subject
        title = s.object.__str__()
        link = _object_of(m, uri, RDF_ABOUT)
        date = _object_of(m, uri, DC_DATE)
        if link is None or date is None:
            # Without both values no meaningful event can be emitted; skip
            # instead of silently reusing the previous item's link/date.
            sys.stderr.write("Skipping %r: missing link or date\n" % title)
            continue
        ## Get record cover
        img = _cover_image(link)
        # The event body is an HTML fragment carried as escaped text.
        body = escape('<img src=%s/>' % quoteattr(img))
        events.append('<event start=%s title=%s link=%s>%s</event>\n' % (
            quoteattr(_iso_date(date)), quoteattr(title), quoteattr(link),
            body))
finally:
    # The download file only exists if at least one page was fetched.
    if os.path.exists(TMP_FILE):
        os.remove(TMP_FILE)

xml = '<data date-time-format="iso8601">' + ''.join(events) + '</data>'

print(xml)

