# CSE 143, Winter 2010, Marty Stepp # Homework 3: HTML Validator (Python) # # Instructor-provided code. # This program tests your HTML validator object on any file or URL you want. # # When it prompts you for a file name, if you type a simple string such # as "test1.html" (without the quotes) it will just look on your hard disk # in the same directory as your code or Eclipse project. # # If you type a string such as "http://www.google.com/index.html", it will # connect to that URL and download the HTML content from it. from htmltag import * from htmlvalidator import * import re import urllib # Reads the file or URL given, and tokenizes the text in that file, # placing the tokens into the given list. # You don't need to call this method in your homework code. # Precondition: address represents a valid file/URL def tokenize(text): queue = re.findall("<[^>]+>", text) for i in range(len(queue)): # '' --> 'foo' tag = re.sub("[<>]|([ \t\n\f].*)", "", queue[i]) if "/" in tag: tag = re.sub("/", "", tag) queue[i] = HtmlTag(tag, False) else: queue[i] = HtmlTag(tag, True) return queue # main validator = None page_text = "" choice = "s" while True: if choice.startswith("s"): # prompt for page, then download it if it's a URL url = input("Page URL or file name (blank for empty): ") page_text = "" if (url.startswith("http:")): print("Downloading from " + url + "...") page_text = urllib.urlopen(url).read() elif len(url) > 0: page_text = open(url).read() # create/update the HTML validator if len(page_text) > 0: tags = tokenize(page_text) validator = HtmlValidator(tags) else: validator = HtmlValidator() elif choice.startswith("p"): print(page_text) elif choice.startswith("a"): tag_text = input("Tag? ") is_open_tag = not "