Listing Files in Directory

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MapReduceDriver extends Configured implements Tool
   public static void main(String[] args) throws Exception 
	MapReduceDriver objMapReduceDriver = new MapReduceDriver();
	Configuration conf = new Configuration();
	FileSystem fs = FileSystem.get(conf);
	Path path = new Path(args[0]);
	FileStatus[] status = fs.listStatus(path);
	Path[] paths = FileUtil.stat2Paths(status);
	for (Path path2 : paths) 
	int res =, args);
// Collect every entry under args[0] into one comma-separated string and
// register the whole list as the job's input.
Path path = new Path(args[0]);
FileStatus[] status = fs.listStatus(path);
Path[] paths = FileUtil.stat2Paths(status);

// Accumulate across iterations: assigning csvPaths inside the loop would keep
// only the last path.
StringBuilder joinedPaths = new StringBuilder();
for (Path path2 : paths) {
  if (joinedPaths.length() > 0) {
    joinedPaths.append(',');
  }
  joinedPaths.append(path2.toString());
}
csvPaths = joinedPaths.toString();

// setInputPaths accepts a comma-separated list of paths.
FileInputFormat.setInputPaths(objJob, csvPaths);

Merging Files in a Folder
copyMerge – Parameters

  1. Source FileSystem Object
  2. Input Path
  3. Destination FileSystem Object
  4. Output Path
  5. Delete Original File
  6. Configuration Object
  7. null (optional string appended after each file)
// Merge all files under args[0] into the single file named by args[2].
// NOTE(review): FileUtil.copyMerge appears to be unavailable in Hadoop 3.x - confirm
// the target Hadoop version before relying on it.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);

Path inputPath = new Path(args[0]);
Path outPath = new Path(args[2]);
// Source and destination use the same file system; "false" keeps the source files.
boolean Merge = FileUtil.copyMerge(fs, inputPath, fs, outPath, false, conf, null);
// Report success only when copyMerge says the merge actually happened.
if (Merge) {
  System.out.println("Merge Successful");
} else {
  System.err.println("Merge Failed");
}

globStatus takes patterns

// Build a glob pattern: entries under <args[0]>/Inputs whose names start with "Input".
Path path = new Path(args[0] + "/Inputs/Input*");
// globStatus expands the pattern and returns the status of each matching entry.
FileStatus[] status = fs.globStatus(path);

Merging Multiple Paths

 import org.apache.commons.lang.StringUtils;
 // Flatten the Path array into a comma-separated string, then split it back
 // into individual path strings.
 csvPaths = StringUtils.join(paths, ",");
 String[] arrcsvPaths = csvPaths.split(",");

 // addInputPath appends to the job's input list; calling setInputPaths in a loop
 // would replace the list each time and leave only the last path registered.
 for (int i = 0; i < arrcsvPaths.length; i++) {
  FileInputFormat.addInputPath(objJob, new Path(arrcsvPaths[i]));
 }

Passing Arguments in Command Context and Fetching It

// Fetch the value supplied on the command line as -DWord.Name=...
String filterWords = context.getConfiguration().get("Word.Name");
// Emit a count of 1 for every element of arrString.
for (Object word : arrString) {
    Text outKey = new Text(word.toString());
    context.write(outKey, new IntWritable(1));
}


 -DWord.Name=Tests /home/turbo/workspace/MapReduce5/src/Inputs/Inputs[1-2] /home/turbo/workspace/MapReduce5/src/Outputs/

Word.Name is the parameter passed on the command line with -D. Such parameters should always be passed first, before the input and output paths.

The -D parameter is removed from the argument list after main() hands the arguments over (the generic options are parsed before run() is called), so args.length is 3 in main() and 2 in the run() method.

Comments are closed.