import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Best-effort extraction of keywords and outgoing links from a web page.
 *
 * <p>The page is not parsed as HTML: it is split into whitespace-separated
 * tokens and simple heuristics decide which tokens are visible words and
 * which carry an {@code href="http..."} attribute. Read failures are
 * reported on {@code System.err} and yield an empty result rather than an
 * exception.
 */
public class WebpageParser {
	/** Matches one complete markup tag inside a token, e.g. {@code <p>} or {@code </a>}. */
	private static final Pattern TAG = Pattern.compile("<[^<>]*>");

	/** One or more punctuation characters that may surround a word. */
	private static final String PUNCTUATION = "[_;:?=+\\-/,.'!@#$%\\^&*()]+";

	/** Matches a punctuation run at the very start or very end of a token. */
	private static final Pattern EDGE_PUNCTUATION =
			Pattern.compile("^" + PUNCTUATION + "|" + PUNCTUATION + "$");

	/** A keyword candidate: at least four word characters and nothing else. */
	private static final Pattern WORD = Pattern.compile("\\w{4,}");

	/** Plain numbers and pixel sizes such as {@code 1024} or {@code 12px}. */
	private static final Pattern NUMERIC = Pattern.compile("\\d+(px)?");

	/** Extracts the charset name from a Content-Type header value. */
	private static final Pattern CHARSET_PARAMETER =
			Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

	/** Charset used when the server does not announce a supported one. */
	private static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

	/** Start of an attribute holding an absolute http/https link. */
	private static final String HTTP_HREF = "href=\"http";

	/** Offset from the start of {@link #HTTP_HREF} to the first URL character. */
	private static final int HREF_VALUE_OFFSET = "href=\"".length();

	/**
	 * File extensions of documents and media that are not followed as links.
	 * {@code pptx} is listed because the earlier substring test on "ppt"
	 * excluded it as well.
	 */
	private static final Set<String> SKIPPED_EXTENSIONS =
			immutableSet("pdf", "ps", "ppt", "pptx", "mp3", "wmv", "mp4");

	/**
	 * Common words (and frequent markup/script residue) to ignore as keywords.
	 * All entries are lower case.
	 */
	private static final Set<String> IGNORED_WORDS = immutableSet(
			"a", "aboard", "about", "above", "across", "after", "against",
			"all", "along", "although", "amid", "among", "and", "another",
			"anti", "any", "anybody", "anyone", "anything", "are", "around",
			"as", "at", "because", "before", "behind", "below", "beneath",
			"beside", "besides", "between", "beyond", "both", "but", "by",
			"concerning", "considering", "despite", "down", "during", "each",
			"either", "else", "even", "everybody", "everyone", "everything",
			"except", "excepting", "excluding", "few", "following", "for",
			"from", "has", "he", "her", "hers", "herself", "him", "himself",
			"his", "how", "i", "if", "in", "inside", "into", "it", "its",
			"itself", "like", "little", "many", "me", "mine", "minus", "more",
			"most", "much", "myself", "near", "neither", "no", "nobody",
			"none", "nor", "nothing", "of", "off", "on", "once", "one", "onto",
			"opposite", "or", "other", "others", "ours", "ourselves",
			"outside", "over", "past", "per", "plus", "regarding", "round",
			"said", "save", "several", "she", "since", "so", "some",
			"somebody", "someone", "something", "than", "that", "theirs",
			"them", "themselves", "these", "they", "this", "those", "though",
			"through", "to", "toward", "towards", "under", "underneath",
			"unless", "unlike", "until", "up", "upon", "us", "versus", "via",
			"we", "what", "whatever", "when", "whenever", "where", "wherever",
			"which", "whichever", "while", "who", "whoever", "whom",
			"whomever", "whose", "why", "with", "within", "without", "will",
			"was", "yet", "you", "yours", "yourself", "yourselves", "an", "is",
			"the", "be", "been", "adsrc", "cms", "hideallads", "end", "sep",
			"newurl", "var", "module", "content", "solid", "html", "div",
			"box", "px", "right", "top", "img", "left", "header", "bottom",
			"styles", "input", "banner", "rail", "url", "null", "sidebar",
			"lwpquerystr", "your", "page", "view", "flash", "begin",
			"function", "instancebegineditable", "nowrap", "instanceparam");

	/**
	 * Given a website, returns a list of keywords.
	 *
	 * <p>Keywords are lower-cased tokens of at least four word characters
	 * that appear outside markup tags, are not plain numbers or pixel sizes,
	 * and are not in the ignored-word list. Duplicates are kept.
	 *
	 * @param location The address of the website; {@code null} yields an
	 *                 empty list.
	 * @return A list of keywords in page order; empty if the page could not
	 *         be read.
	 */
	public static List<String> getWordList(URL location) {
		if (location == null) {
			return new ArrayList<String>();
		}
		try {
			return readWords(readWebsite(location));
		} catch (IOException ioe) {
			reportFailure(location, ioe);
			return new ArrayList<String>();
		}
	}

	/**
	 * Given a website, returns a list of addresses it links to.
	 *
	 * <p>Only absolute {@code href="http..."} links (http and https) are
	 * collected; links whose path ends in a document or media extension
	 * (pdf, ps, ppt, pptx, mp3, wmv, mp4) are skipped.
	 *
	 * @param location The address of the webpage; {@code null} yields an
	 *                 empty set.
	 * @param max the maximum number of links to return
	 * @return a set with at most {@code max} link addresses; empty if the
	 *         page could not be read.
	 */
	public static Set<URL> getLinkList(URL location, int max) {
		System.out.println("processing: " + location);
		if (location == null || max <= 0) {
			return new HashSet<URL>();
		}
		try {
			return readLinks(readWebsite(location), max);
		} catch (IOException ioe) {
			reportFailure(location, ioe);
			return new HashSet<URL>();
		}
	}

	/** Makes a swallowed read failure visible without changing the best-effort contract. */
	private static void reportFailure(URL location, IOException cause) {
		System.err.println("failed to read " + location + ": " + cause);
	}

	/**
	 * Opens the page for reading, bypassing caches.
	 *
	 * @param url the page address
	 * @return a reader over the page body; the caller must close it
	 * @throws IOException if the connection cannot be opened or read
	 */
	private static BufferedReader readWebsite(URL url)
			throws MalformedURLException, IOException {
		URLConnection connection = url.openConnection();
		connection.setDoInput(true);
		connection.setUseCaches(false);

		// Open the stream first so connection errors surface as IOException
		// here; the header lookup below would otherwise hide them.
		InputStream body = connection.getInputStream();
		return new BufferedReader(
				new InputStreamReader(body, charsetOf(connection)));
	}

	/**
	 * Returns the charset named in the Content-Type header when this JVM
	 * supports it, otherwise UTF-8 (instead of the platform default).
	 */
	private static Charset charsetOf(URLConnection connection) {
		String contentType = connection.getContentType();
		if (contentType != null) {
			Matcher parameter = CHARSET_PARAMETER.matcher(contentType);
			if (parameter.find()) {
				try {
					if (Charset.isSupported(parameter.group(1))) {
						return Charset.forName(parameter.group(1));
					}
				} catch (IllegalArgumentException ignored) {
					// Syntactically illegal charset name: use the default.
				}
			}
		}
		return DEFAULT_CHARSET;
	}

	/**
	 * Collects keywords from the page text and closes the reader.
	 *
	 * <p>Tags that are complete within one token are removed; a token with a
	 * remaining {@code <} opens a tag that spans several tokens, and a
	 * remaining {@code >} closes it. Tokens seen while such a tag is open
	 * are attribute text and are not considered.
	 */
	private static List<String> readWords(BufferedReader buf) {
		List<String> wordList = new ArrayList<String>();
		Scanner scan = new Scanner(buf);
		try {
			int tagDepth = 0;
			while (scan.hasNext()) {
				// Remove every complete tag without touching the text between
				// two tags of the same token ("<p>word</p>" -> "word").
				String token = TAG.matcher(scan.next()).replaceAll("");
				if (token.contains("<")) {
					tagDepth++;
				}
				if (tagDepth == 0) {
					// Strip surrounding punctuation before the word test so
					// that "word," and "(word)" still count.
					String word = EDGE_PUNCTUATION.matcher(token)
							.replaceAll("").toLowerCase(Locale.ROOT);
					if (isKeyword(word)) {
						wordList.add(word);
					}
				}
				// Never go below zero: a stray '>' in text or script must not
				// suppress every following word.
				if (token.contains(">") && tagDepth > 0) {
					tagDepth--;
				}
			}
		} finally {
			scan.close();
		}
		return wordList;
	}

	/** Tells whether a lower-cased, punctuation-free token qualifies as a keyword. */
	private static boolean isKeyword(String word) {
		return WORD.matcher(word).matches()
				&& !NUMERIC.matcher(word).matches()
				&& !IGNORED_WORDS.contains(word);
	}

	/**
	 * Collects up to {@code max} distinct absolute links and closes the reader.
	 *
	 * <p>NOTE: {@code java.net.URL} hashing may resolve host names, so adding
	 * links to the set can trigger DNS lookups.
	 */
	private static Set<URL> readLinks(BufferedReader buf, int max) {
		Set<URL> linkSet = new HashSet<URL>();
		Scanner scan = new Scanner(buf);
		try {
			while (scan.hasNext() && linkSet.size() < max) {
				String token = scan.next();
				int marker = token.indexOf(HTTP_HREF);
				if (marker < 0) {
					continue;
				}
				// Take the value of the href attribute itself, not of whatever
				// quoted attribute happens to come first in the token.
				int start = marker + HREF_VALUE_OFFSET;
				int end = token.indexOf('"', start);
				if (end < 0) {
					continue;
				}
				try {
					URL link = new URL(token.substring(start, end));
					if (!hasSkippedExtension(link)) {
						linkSet.add(link);
					}
				} catch (MalformedURLException e) {
					// Not a usable address: skip this link.
				}
			}
		} finally {
			scan.close();
		}
		return linkSet;
	}

	/**
	 * Tells whether the link's path ends in one of the skipped extensions.
	 * Only the last path segment is inspected, so "https" or a directory
	 * such as "/steps/" no longer disqualifies a link.
	 */
	private static boolean hasSkippedExtension(URL link) {
		String path = link.getPath();
		int dot = path.lastIndexOf('.');
		if (dot < 0 || dot < path.lastIndexOf('/')) {
			return false;
		}
		String extension = path.substring(dot + 1).toLowerCase(Locale.ROOT);
		return SKIPPED_EXTENSIONS.contains(extension);
	}

	/** Builds an unmodifiable set from the given values. */
	private static Set<String> immutableSet(String... values) {
		return Collections.unmodifiableSet(
				new HashSet<String>(Arrays.asList(values)));
	}
}
