# This program pulls up the APOD (Astronomy Picture of the Day) website.
# It uses lxml to iterate over the HTML tags until it finds the image tag,
# then downloads and saves the image. Next it uses lxml to strip the text
# from the page. Finally it slices the image description block out of that
# text and saves it as a text file.
#
# This program depends on Python 3 and the lxml package.
#
# www.fallenhobbit.com
# fallenhobbit@gmail.com

# module imports
# NOTE: the lxml imports live inside the functions that need them so that the
# pure text helpers below stay importable when lxml is not installed.
import io
from urllib.parse import urljoin, urlsplit
from urllib.request import urlopen

# Page that is scraped for both the picture and its explanation.
APOD_URL = "http://apod.nasa.gov/apod/"


def saveImg(img_url: str) -> None:
    """Download *img_url* and store its bytes in ``apod.jpg``.

    The download completes before the output file is opened, so a failed
    request does not leave a truncated ``apod.jpg`` behind.
    """
    with urlopen(img_url) as response:
        data = response.read()
    with open("apod.jpg", "wb") as f:
        f.write(data)


def stripText(text_url: str) -> str:
    """Return the visible text of the page at *text_url*.

    The markup is cleaned with lxml's ``clean_html``, the text content of the
    root element is extracted, and every run of whitespace is collapsed to a
    single space.
    """
    from lxml import html
    from lxml.html.clean import clean_html

    if urlsplit(text_url).scheme in ("http", "https"):
        # Fetch with urllib rather than handing the URL to lxml: urllib
        # follows redirects and speaks HTTPS, which lxml's own URL loading
        # does not.
        with urlopen(text_url) as response:
            tree = html.parse(response)
    else:
        # Anything else (e.g. a local file name) is passed straight to lxml,
        # as before.
        tree = html.parse(text_url)

    tree = clean_html(tree)
    text = tree.getroot().text_content()
    return " ".join(text.split())


def findBlock(stripped_text: str) -> str:
    """Return the image description block found in *stripped_text*.

    The block starts at the first occurrence of ``"Explanation"`` and ends
    just before the next ``"Tomorrow's picture:"``. If the start marker is
    missing an empty string is returned; if the end marker is missing the
    block runs to the end of the text.
    """
    start = stripped_text.find("Explanation")
    if start == -1:
        # str.find signals "not found" with -1, which must not be used as a
        # slice bound.
        return ""

    # Look for the end marker only after the start marker so that an earlier
    # occurrence cannot produce an empty or inverted slice.
    end = stripped_text.find("Tomorrow's picture:", start)
    if end == -1:
        end = len(stripped_text)
    return stripped_text[start:end]


def saveBlock(stripped_text: str) -> None:
    """Write *stripped_text* to ``apod.txt``.

    The file is written as UTF-8 so that non-ASCII characters in the text
    cannot fail under a narrower locale encoding.
    """
    with open("apod.txt", "w", encoding="utf-8") as f:
        f.write(stripped_text)


def stripImage() -> None:
    """Find the first ``<img>`` with a ``src`` on the APOD page and save it.

    Nothing is saved when the page has no such tag.
    """
    from lxml import etree

    # The page is decoded as latin1, which maps every byte to a character and
    # therefore never raises a decoding error.
    with io.TextIOWrapper(urlopen(APOD_URL), encoding="latin1") as stream:
        page = stream.read()

    root = etree.fromstring(page, etree.HTMLParser())
    if root is None:
        # The HTML parser can yield no root element for an empty document.
        return

    for img in root.iter("img"):
        src = img.get("src")
        if src:
            # urljoin resolves relative paths against the page URL and leaves
            # absolute URLs untouched.
            saveImg(urljoin(APOD_URL, src))
            break


def main() -> None:
    """Save today's APOD image and its explanation text."""
    stripImage()
    saveBlock(findBlock(stripText(APOD_URL)))


if __name__ == "__main__":
    main()