Listing Files in Directory
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class MapReduceDriver extends Configured implements Tool {

    /**
     * Prints every entry returned by listing the path given in {@code args[0]},
     * then runs this driver through {@code ToolRunner} and exits with the
     * status it returns.
     *
     * @param args command-line arguments; {@code args[0]} is the path to list
     * @throws Exception if the listing or the job run fails
     */
    public static void main(String[] args) throws Exception {
        MapReduceDriver driver = new MapReduceDriver();

        // List the directory before the job is launched.
        FileSystem fileSystem = FileSystem.get(new Configuration());
        FileStatus[] listing = fileSystem.listStatus(new Path(args[0]));
        for (Path entry : FileUtil.stat2Paths(listing)) {
            System.out.println(entry.toString());
        }

        System.exit(ToolRunner.run(driver, args));
    }
    // The remainder of the class (e.g. run(String[])) is not shown in this snippet.
// Use every file listed under args[0] as job input.
Path path = new Path(args[0]);
FileStatus[] status = fs.listStatus(path);
Path[] paths = FileUtil.stat2Paths(status);

// Accumulate ALL paths into one comma-separated string. Joining a single
// element inside the loop would overwrite csvPaths on each iteration and
// leave only the last path.
StringBuilder joinedPaths = new StringBuilder();
for (Path path2 : paths) {
    if (joinedPaths.length() > 0) {
        joinedPaths.append(',');
    }
    joinedPaths.append(path2.toString());
}
csvPaths = joinedPaths.toString();

// The String overload of setInputPaths accepts a comma-separated list.
FileInputFormat.setInputPaths(objJob, csvPaths);
Merging Files in a Folder
copyMerge – Parameters
- FileSystem Object
- Input Path
- FileSystem Object
- Output Path
- Delete Original File
- Configuration Object
- null
// Merge the files under args[0] into the single file args[2].
// The fifth argument (false) keeps the source files; the last argument is null.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path inputPath = new Path(args[0]);
Path outPath = new Path(args[2]);

if (FileUtil.copyMerge(fs, inputPath, fs, outPath, false, conf, null)) {
    System.out.println("Merge Successful");
}
globStatus takes patterns
// Build the glob pattern: entries under <args[0]>/Inputs whose names start with "Input".
String globPattern = args[0] + "/Inputs/Input*";
Path path = new Path(globPattern);

// globStatus expands the pattern into the matching file statuses.
FileStatus[] status = fs.globStatus(path);
Merging Multiple Paths
import org.apache.commons.lang.StringUtils;

// Join all paths into one comma-separated string.
csvPaths = StringUtils.join(paths, ",");

// setInputPaths REPLACES the job's input list on every call, so calling it
// once per path in a loop would leave only the last path registered.
// Pass the whole comma-separated list in a single call instead.
FileInputFormat.setInputPaths(objJob, csvPaths);
Passing Arguments in Command Context and Fetching It
// Emit (word, 1) only for elements equal to the value passed as -DWord.Name=<word>.
String filterWords = context.getConfiguration().get("Word.Name");
for (int i = 0; i < arrString.length; i++) {
    String candidate = arrString[i].toString();
    // filterWords is null when Word.Name was not supplied; calling equals on
    // the candidate avoids a NullPointerException (nothing matches then).
    if (candidate.equals(filterWords)) {
        context.write(new Text(candidate), new IntWritable(1));
    }
}
Input
-DWord.Name=Tests /home/turbo/workspace/MapReduce5/src/Inputs/Inputs[1-2] /home/turbo/workspace/MapReduce5/src/Outputs/
Word.Name is the parameter passed on the command line. Parameters (-D options) should always be passed first, before the other arguments.
ToolRunner removes the -D parameter from the arguments before it calls run(). So args.length is 3 in main() and 2 in the run() method.