import java.io.IOException; import java.util.regex.Pattern; import java.util.regex.Matcher; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; public class WordCountMapper extends Mapper { // create these guys up here for speed private final static IntWritable one = new IntWritable(1); private Text word = new Text(); // create a Pattern object to parse each line private final Pattern wordPattern = Pattern.compile ("[a-zA-Z][a-zA-Z0-9]+"); public void map (LongWritable key, Text value, Context context) throws IOException, InterruptedException { // get a String version of the line String line = value.toString(); // and a Matcher to parse it Matcher myMatcher = wordPattern.matcher (line); // while there are more tokens in the line while (myMatcher.find ()) { // get the next pattern, and convert it to lower case String returnVal = myMatcher.group (); returnVal = new String (returnVal.toLowerCase ()); word.set (returnVal); // send it on to the reducer! context.write (word, one); } } }