#!/usr/bin/env python

# printURLS.py

import sys
from sets import Set

from urllib import urlopen
from urlparse import urlparse, urljoin
from HTMLParser import HTMLParser

class HrefParser(HTMLParser):
    """HTML parser that collects the href value of every <a> tag it sees.

    Feed markup with feed(); the collected values are then available,
    in document order, from hrefs().  Values are stored exactly as they
    appear in the markup (relative references are not resolved here).
    """

    def __init__(self):
        # The base initialiser must run so the parser state is set up.
        HTMLParser.__init__(self)
        self.__refs = []

    def handle_starttag(self, tag, attrs):
        """Record the first href attribute of an anchor start tag.

        tag   -- name of the tag being opened
        attrs -- sequence of (name, value) pairs for the tag's attributes
        """
        if tag.lower() != 'a':
            return
        for name, value in attrs:
            if name.lower() == "href":
                # A valueless attribute (<a href>) arrives as None; it
                # names no target, so there is nothing worth recording.
                if value is not None:
                    self.__refs.append(value)
                # Only the first href on a tag is considered.
                break

    def hrefs(self):
        """Return the list of href values collected so far."""
        return self.__refs

def buildURLS(baseURL, refs):
    """Turn raw href values into a set of absolute web URLs.

    baseURL -- URL of the page the references were found on; references
               without a scheme are resolved against it
    refs    -- iterable of href strings (None entries are ignored)

    Returns a set of URL strings.  References without a scheme are joined
    onto baseURL; references with an http or https scheme are kept as
    written; every other scheme (mailto:, javascript:, ftp:, ...) is
    dropped.  References that cannot be parsed are skipped.
    """
    urls = set()
    for ref in refs:
        if ref is None:
            continue
        # Whitespace around an href is not part of the URL, and a leading
        # blank would otherwise hide the scheme from urlparse.
        ref = ref.strip()
        try:
            protocol = urlparse(ref)[0].lower()
            if protocol == '':
                urls.add(urljoin(baseURL, ref))
            elif protocol in ('http', 'https'):
                urls.add(ref)
        except ValueError:
            # A single malformed link must not abort the whole listing.
            continue
    return urls
        

if __name__ == '__main__':
    # Script entry point: fetch the page named on the command line and
    # print every link found in it, one per line.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s URL\n" % sys.argv[0])
        sys.exit(2)

    url = sys.argv[1]
    # Default to http:// only when no scheme was given at all; blindly
    # prefixing would mangle an argument such as "https://host/".
    if "://" not in url:
        url = "http://" + url

    # Close the connection even if reading the body fails.
    response = urlopen(url)
    try:
        page = response.read()
    finally:
        response.close()

    parser = HrefParser()
    parser.feed(page)
    # Separate loop name so the page URL is not clobbered while iterating.
    for link in buildURLS(url, parser.hrefs()):
        print(link)

