MapReduce: Split a txt file into multiple files based on a pattern in a file -
I have a tab-separated .txt file like this:
05-12-2011 02:00:00 xyzz
05-12-2011 02:01:00 xyzz
05-12-2011 02:02:00 xyzz
05-12-2011 02:03:00 xyzz
05-12-2011 02:04:00 abcd
05-12-2011 02:05:00 abcd
05-12-2011 02:06:00 abcd
05-12-2011 02:07:00 xyzz
05-12-2011 02:08:00 abcd
I want to write this data to different files, such that the lines matching the pattern "xyzz" go to one file and the lines matching "abcd" go to another file.
file1.txt should contain:
05-12-2011 02:01:00 xyzz
05-12-2011 02:02:00 xyzz
05-12-2011 02:03:00 xyzz
05-12-2011 02:07:00 xyzz
and file2.txt should contain:
05-12-2011 02:04:00 abcd
05-12-2011 02:05:00 abcd
05-12-2011 02:06:00 abcd
Here's the code I want to share.
public class wordcount2 { public static class tokenizermapper2 extends mapper<object, text, text, intwritable>{ private final static intwritable 1 = new intwritable(1); private text word = new text(); public void map(object key, text value, context context ) throws ioexception, interruptedexception { stringtokenizer itr = new stringtokenizer(value.tostring()); while (itr.hasmoretokens()) { word.set(itr.nexttoken()); context.write(word, one); } } } public static class intsumreducer2 extends reducer<text,intwritable,text,intwritable> { private intwritable result = new intwritable(); public void reduce(text key, iterable<intwritable> values, context context ) throws ioexception, interruptedexception { /* int sum = 0; (intwritable val : values) { sum += val.get(); } result.set(sum); context.write(key, result);*/ } } public static void main(string[] args) throws exception { configuration conf = new configuration(); string line; string arguements[]; string[] otherargs = new genericoptionsparser(conf, args).getremainingargs(); // calculating total number of attributes in file filereader infile = new filereader(args[0]); bufferedreader bufread = new bufferedreader(infile); line = bufread.readline(); arguements = line.split(","); //for spliting fields separated comma conf.setint("argno", arguements.length); // saving attribute value job job = new job(conf, "word count"); job.setjarbyclass(wordcount.class); job.setmapperclass(tokenizermapper.class); job.setcombinerclass(intsumreducer.class); job.setreducerclass(intsumreducer.class); job.setoutputkeyclass(text.class); job.setoutputvalueclass(intwritable.class); fileinputformat.addinputpath(job, new path(otherargs[0])); fileoutputformat.setoutputpath(job, new path(otherargs[1])); system.exit(job.waitforcompletion(true) ? 0 : 1); } } mapreduce
No comments:
Post a Comment