#!/usr/bin/python

# HTExploit - Written by Matias Katz (@matiaskatz)
# Presented at Black Hat US 2012 Conference
# Team Leader: Maximiliano Soler (@maxisoler)
# Homepage: www.htexploit.org

# File 'lib/LinkScan.py' searches each downloaded file for HREF links and creates a new set of files to download, based on its findings.

import Conn, FullList
import httplib, os, sys, urllib2, urlparse
from HTMLParser import HTMLParser

links = ""
l = 0

def Scan(url,verbose,outdir):
	global links
        class Parser(HTMLParser):
		def handle_starttag(self, tag, attrs):
			global links, l#, tot_files, tot_iterations
			if tag == 'a':
		        	for (key, value) in attrs:
			            if key == 'href':
			                newUrl = urlparse.urljoin(url, value)
					links += value + "\n"

	l = 0
	url += "/"
	if verbose > 0:
		print "[+] Link Scan started"
	if not os.path.exists(outdir + "/LinkScan"):
                try:
                        os.makedirs(outdir + "/LinkScan")
                except:
                        print "[-] Error creating output directory, do you have permissions?"
                        sys.exit("[-] Exiting.\n")

	repfile = open(outdir + "/FullList/rawlist","r").read().split()
	for line in repfile:
		if verbose > 1:
			print "[+] Scanning '" + line.strip() + "'... ",
		fullurl = urlparse.urljoin(url,line)
		data = Conn.Connect(2,fullurl)
	        linkParse = Parser()
		linkParse.feed(data)
		if verbose > 1:
			print len(links.split()) , " link(s) found."

		for link in links.split():
			#if verbose > 1:
			#	print "    [+] Found link: '" + str(links).strip() + "'"
			rawfile = open(outdir + "/LinkScan/temprawlist","a")
			rawfile.writelines(link + "\n")
			rawfile.close()
			#outfile = open(outdir + "/LinkScan/file-" + str(i).zfill(8) + ".html", "w")
			l += 1
		links = ""

	if verbose > 0:
		print "\n[+] Removing duplicates..."
	lines_seen = set() # Holds lines already seen
	outfile = open(outdir + "/LinkScan/rawlist", "w")
	try:
		for line in open(outdir + "/LinkScan/temprawlist", "r"):
		    if line not in lines_seen: # Not a duplicate
		        outfile.write(line)
		        lines_seen.add(line)
			l = l - 1
		outfile.close()
		os.remove(outdir + "/LinkScan/temprawlist")
	except:
		pass

	if verbose > 0:
		print
		print "[+] Link Scan Completed"
		print "[+]" , l , "new links found"

	FullList.Scan(url,verbose,outdir,outdir + "/LinkScan/rawlist")
