# CSE 143, Winter 2009, Marty Stepp
# Homework 2: HTML Validator (Python)
#
# Instructor-provided code.
# This program tests your HTML validator object on any file or URL you want.
#
# When it prompts you for a file name, if you type a simple string such
# as "test1.html" (without the quotes) it will just look on your hard disk
# in the same directory as your code or Eclipse project.
#
# If you type a string such as "http://www.google.com/index.html", it will
# connect to that URL and download the HTML content from it.

from htmltag import *
from htmlvalidator import *
import re
import urllib

# Matches one complete "<...>" span in the page text.
_TAG_SPAN = re.compile("<[^>]+>")

# Matches everything that is not part of the element name: the angle
# brackets, and the first whitespace character plus whatever follows it
# (i.e. the attributes).
_NON_NAME = re.compile("[<>]|([ \t\n\f].*)")


# Tokenizes the given HTML text and returns a new list containing one
# HtmlTag per "<...>" span, in the order the spans appear in the text.
# Each tag is built as HtmlTag(name, flag), where flag is False when the
# span contains a "/" before its first whitespace and True otherwise
# (presumably "is an opening tag" -- see htmltag for the exact meaning).
# You don't need to call this function in your homework code.
#
# NOTE(review): any "/" in the name part marks the tag as closing, so
# "<br/>" yields a closing "br" while "<br />" yields an opening "br".
# Comments such as "<!-- x -->" are tokenized like ordinary tags.
def tokenize(text):
    tags = []
    for span in _TAG_SPAN.findall(text):
        # '<foo bar="baz">' --> 'foo'
        name = _NON_NAME.sub("", span)
        if "/" in name:
            tags.append(HtmlTag(name.replace("/", ""), False))
        else:
            tags.append(HtmlTag(name, True))
    return tags


# Returns the entire text found at the given address. Addresses starting
# with "http:" or "https:" are downloaded; anything else is opened as a
# local file name. The underlying file/connection is always closed, even
# if reading fails.
# Precondition: address represents a valid file/URL
def read_page(address):
    if address.startswith(("http:", "https:")):
        print("Downloading from " + address + "...")
        source = urllib.urlopen(address)
    else:
        source = open(address)
    try:
        return source.read()
    finally:
        source.close()


# main
validator = None
page_text = ""
choice = "s"    # start by asking for a page, as if the user had typed "s"
while True:
    if choice.startswith("s"):
        # prompt for page, then download it if it's a URL
        url = raw_input("Page URL or file name: ")
        page_text = read_page(url)
        tags = tokenize(page_text)

        # create/update the HTML validator
        if validator is None:
            validator = HtmlValidator(tags)
        else:
            validator.tags = tags
    elif choice.startswith("p"):
        print(page_text)
    elif choice.startswith("g"):
        # show the tags as a bracketed list without quotes around each one
        tag_names = [str(tag) for tag in validator.tags]
        print("tags: " + str(tag_names).replace("'", ""))
    elif choice.startswith("v"):
        result = validator.validate()
        print("")
        print("validate() returned " + str(result))
    else:
        # anything else (including "q" or an empty line) quits
        break

    print("")
    choice = raw_input("(g)etTags, (v)alidate, (s)et URL, (p)rint HTML, or (q)uit? ").lower()