# Downloads addresses from:
# http://www.summet.com/dmsi/html/codesamples/addresses.html
# and prints every (name, phone number) pair found on the page.
#
# NOTE(review): the HTML tags that appeared inside this file's comments and
# regex patterns were lost when the file was extracted (they had been
# rendered as bullets and line breaks). The <li> and <br> tags below are
# reconstructed from the surrounding comments -- confirm them against the
# live page before relying on the results.

import urllib.request
from re import findall

URL = "http://www.summet.com/dmsi/html/codesamples/addresses.html"

# Previous work that builds up to the final pattern below:
#
# 1. Find all phone numbers. Note how the parentheses are escaped with
#    backslashes, as unescaped parentheses have a special regex meaning.
#        pdata = findall(r"\(\d{3}\) \d{3}-\d{4}", page_text)
#
# 2. Find all the names, using the <li>Firstname Lastname</li> pattern.
#    Parentheses isolate and pull out just the name data, leaving the
#    bracketing HTML tags <li> and </li> alone.
#        ndata = findall(r"<li>([A-Za-z]+ [A-Za-z]+)</li>", page_text)
#    Example which captures the first and last name separately:
#        ndata = findall(r"<li>([A-Za-z]+) ([A-Za-z]+)</li>", page_text)
#
# 3. Linking a name to a phone number with ".*" in between does not work:
#    ".*" is greedy, so it can run from the first entry all the way to the
#    last phone number, matching only the first name and the last phone
#    number. (The original author notes that this pattern works on
#    gskinner.com/RegExr but not in Python.)
#        data = findall(r"<li>(\S+ \S+)<br/>.*<br/>(\(\d{3}\) \d{3}-\d{4})",
#                       page_text)

# Final pattern: capture the name and the phone number, and match -- but do
# not capture -- the two intervening lines (street address, then
# city/state/zip). "[^<]*" consumes text only up to the next tag, so each
# intervening line is matched exactly once; ".*" would match many of them.
# Raw strings keep the regex backslashes out of Python's own escape handling.
# "<br\s*/?>" accepts <br>, <br/> and <br />. No closing tag is required
# after the phone number; the phone pattern itself ends the match.
CONTACT_PATTERN = (
    r"<li>(\S+ \S+)<br\s*/?>"
    r"[^<]*<br\s*/?>"
    r"[^<]*<br\s*/?>"
    r"(\(\d{3}\) \d{3}-\d{4})"
)


def extract_contacts(page_text):
    """Return the (name, phone) tuples found in *page_text*.

    Args:
        page_text: HTML source of the address page, as a str.

    Returns:
        A list of 2-tuples of str, one per entry matching CONTACT_PATTERN,
        in document order. The list is empty when nothing matches.
    """
    return findall(CONTACT_PATTERN, page_text)


def download_page(url=URL):
    """Fetch *url* and return its body decoded to str.

    The response is closed as soon as the body has been read. The body is
    decoded with the charset declared in the response headers, falling back
    to UTF-8; undecodable bytes are replaced rather than raising. (Calling
    str() on the raw bytes would instead produce the b'...' repr, with
    escape sequences in place of the real characters.)

    Raises:
        urllib.error.URLError: if the page cannot be retrieved.
    """
    with urllib.request.urlopen(url) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        return response.read().decode(charset, errors="replace")


def main():
    """Download the address page and print each (name, phone) tuple."""
    for item in extract_contacts(download_page()):
        print(item)
    print("Done!")


if __name__ == "__main__":
    main()