
I have to: 1. Retrieve the document text from the web (provided by a utility class). 2. Filter the desired "words" from the document and, one by one, store each word as a key in a Map<String,Integer> object, where the value is the number of occurrences of the word. 3. Read the (word, num_occurrences) map entry pairs into an array/list structure of your choice. 4. Sort the pair list by num_occurrences. 5. Print the total number of words processed, the number of unique words, and the N pairs which have the largest number of occurrences.
Here's what I have so far. The first class is the WebDoc utility class and the second is the main class. I have added commented-out blocks marking where the new code should go. Please help!
package util;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.HTML;
import java.io.InputStreamReader;
import java.io.IOException;
import java.net.URL;
import java.net.MalformedURLException;
/**
 * Utility for pulling the visible text out of an HTML document.
 */
public class WebDoc {
    /**
     * Downloads the document at {@code urlstr}, parses it as HTML and returns
     * the text found after the BODY start tag.
     *
     * <p>Each run of text reported by the parser is appended followed by a
     * single space, so the result ends with a trailing space whenever any
     * text was found. Text that appears before the BODY tag (for example the
     * page title) is not included.
     *
     * @param urlstr the URL of the document to read
     * @return the concatenated body text; empty if no text follows a BODY tag
     * @throws MalformedURLException if {@code urlstr} is not a valid URL
     * @throws IOException if the document cannot be opened or read
     */
    public static String getBodyContent(String urlstr)
            throws MalformedURLException, IOException {
        /*
         * getParser() is a protected method in HTMLEditorKit, so we create an
         * anonymous subclass that exposes it through a public override.
         */
        HTMLEditorKit.Parser parser = new HTMLEditorKit() {
            @Override
            public HTMLEditorKit.Parser getParser() {
                return super.getParser();
            }
        }.getParser();

        // Mutable state shared with the callback below (locals captured by an
        // anonymous class must be final, so the state lives in a holder).
        class DocStatus {
            // StringBuilder avoids re-copying the whole text on every append.
            public final StringBuilder content = new StringBuilder();
            public boolean body_started = false;
        }
        final DocStatus status = new DocStatus();

        HTMLEditorKit.ParserCallback callback = new HTMLEditorKit.ParserCallback() {
            // handle the tags: look for the BODY tag
            @Override
            public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
                if (t == HTML.Tag.BODY) {
                    status.body_started = true;
                }
            }

            // handle the text between tags: concatenate all text after BODY tag
            @Override
            public void handleText(char[] text, int position) {
                if (status.body_started) {
                    status.content.append(text).append(' ');
                }
            }
        };

        URL url = new URL(urlstr);
        // NOTE(review): the reader uses the platform default charset, as the
        // original code did; pages in another encoding may decode incorrectly.
        InputStreamReader r = new InputStreamReader(url.openStream());
        try {
            parser.parse(r, callback, true);
        } finally {
            // Always release the underlying connection/stream, even when
            // parsing fails part-way through.
            r.close();
        }
        return status.content.toString();
    }
}
package dsprog3;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import util.WebDoc;
/**
 * Prints a word-frequency report for the body text of a web page.
 */
public class DSProg3 {
    /**
     * Fetches the body text of a fixed test URL, counts every word of five or
     * more ASCII letters (case-insensitively), and prints the total number of
     * words processed, the number of unique words, and the N most frequent
     * words with their counts.
     *
     * <p>If the page cannot be retrieved, the stack trace is printed and the
     * program exits with status 1.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String url;
        //test URLs
        url = "http://en.wikipedia.org/wiki/Jimi_Hendrix";
        final int N = 25; //the number of word/frequency pairs to print
        //word pattern recognizes a string of 5 or more letters
        String word_pattern = "[A-Za-z]{5,}";

        String content = null;
        try {
            content = WebDoc.getBodyContent(url); // get body of the web document
        } catch (Exception ex) {
            ex.printStackTrace();
            System.exit(1);
        }

        Map<String,Integer> wordCount = new HashMap<String,Integer>();
        int total_words = 0;
        Matcher match = Pattern.compile(word_pattern).matcher(content);
        while (match.find()) {
            ++total_words;
            // Normalize to lower case so "Guitar" and "guitar" share one key.
            // Locale.ROOT keeps the mapping independent of the machine's
            // default locale (e.g. Turkish would lower-case 'I' differently).
            String word = match.group().toLowerCase(Locale.ROOT);
            // "Register" one more occurrence of the word in the map.
            Integer seen = wordCount.get(word);
            wordCount.put(word, seen == null ? 1 : seen + 1);
        }

        // A (word, number of occurrences) pair copied out of the map so the
        // entries can be sorted by count.
        class WordPair {
            final String word;
            final Integer count; // number of occurrences
            WordPair(String word, Integer count) {
                this.word = word;
                this.count = count;
            }
        }

        // Copy the map entries into a list; a HashMap has no useful order.
        List<WordPair> pairs = new ArrayList<WordPair>(wordCount.size());
        for (Map.Entry<String,Integer> entry : wordCount.entrySet()) {
            pairs.add(new WordPair(entry.getKey(), entry.getValue()));
        }

        // Highest count first; ties are broken alphabetically so the output
        // does not depend on HashMap iteration order.
        Comparator<WordPair> byCountDescending = new Comparator<WordPair>() {
            @Override
            public int compare(WordPair a, WordPair b) {
                int byCount = b.count.compareTo(a.count);
                return byCount != 0 ? byCount : a.word.compareTo(b.word);
            }
        };
        Collections.sort(pairs, byCountDescending);

        System.out.println("Total words processed: " + total_words);
        System.out.println("Unique words: " + wordCount.size());
        // The page may contain fewer than N distinct words.
        int shown = Math.min(N, pairs.size());
        System.out.println("Top " + shown + " words by number of occurrences:");
        for (int i = 0; i < shown; i++) {
            WordPair pair = pairs.get(i);
            System.out.println(pair.word + ": " + pair.count);
        }
    }
}
If you are facing a programming issue, such as a compilation error, or you cannot find the code you are looking for,
ask your question and our development team will try to answer it.