Listing Files in Directory

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class MapReduceDriver extends Configured implements Tool
{
   /**
    * Prints every entry found directly under the directory named by args[0],
    * then runs the driver through ToolRunner and exits with its return code.
    *
    * @param args command-line arguments; args[0] is used as the directory to list
    * @throws Exception propagated from the file-system calls or the tool run
    */
   public static void main(String[] args) throws Exception
   {
      MapReduceDriver driver = new MapReduceDriver();

      // Resolve the file system from a fresh configuration.
      Configuration conf = new Configuration();
      FileSystem fs = FileSystem.get(conf);

      // Convert the directory listing into plain Path objects.
      Path[] entries = FileUtil.stat2Paths(fs.listStatus(new Path(args[0])));

      for (int i = 0; i < entries.length; i++)
      {
         System.out.println(entries[i].toString());
      }

      // The tool's return value becomes the process exit status.
      System.exit(ToolRunner.run(driver, args));
   }
// List every entry under the input directory given as args[0].
Path path = new Path(args[0]);
FileStatus[] status = fs.listStatus(path);
Path[] paths = FileUtil.stat2Paths(status);

// Accumulate ALL paths into one comma-separated string. Assigning
// String.join(",", onePath) inside the loop would overwrite csvPaths on every
// iteration and leave only the last path in it.
StringBuilder joinedPaths = new StringBuilder();
for (Path path2 : paths)
{
  if (joinedPaths.length() > 0)
  {
    joinedPaths.append(',');
  }
  joinedPaths.append(path2.toString());
}
csvPaths = joinedPaths.toString();

// Hand the complete comma-separated list to the job in a single call.
FileInputFormat.setInputPaths(objJob, csvPaths);

Merging Files in a Folder
copyMerge – Parameters

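  1. Source FileSystem object
  2. Input path (directory containing the files to merge)
  3. Destination FileSystem object
  4. Output path (the merged file)
  5. Delete original files (boolean)
  6. Configuration object
  7. String appended after each file (null for none)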
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);

// args[0] names the folder to merge; args[2] names the merged output.
Path inputPath = new Path(args[0]);
Path outPath = new Path(args[2]);

// Source and destination use the same file system. Passing false keeps the
// original files, and null is passed as the final argument.
if (FileUtil.copyMerge(fs, inputPath, fs, outPath, false, conf, null))
{
  System.out.println("Merge Successful");
}
		

globStatus takes patterns

// Build a wildcard pattern under args[0] and let globStatus expand it.
String inputPattern = args[0] + "/Inputs/Input*";
Path path = new Path(inputPattern);
FileStatus[] status = fs.globStatus(path);

Merging Multiple Paths

 import org.apache.commons.lang.StringUtils;
 
 // Join every path into one comma-separated string, then split it back into
 // the individual path strings.
 csvPaths = StringUtils.join(paths,",");
 String[] arrcsvPaths = csvPaths.split(",");

 // addInputPath appends to the job's input list. Calling setInputPaths inside
 // the loop would replace the list on every iteration, leaving only the last path.
 for (int i = 0; i < arrcsvPaths.length; i++)
 {
  FileInputFormat.addInputPath(objJob, new Path(arrcsvPaths[i]));
 }

Passing Arguments on the Command Line and Fetching Them from the Context

// Word to keep, read from the job configuration under the key "Word.Name".
// NOTE(review): if the key is not set this is presumably null and equals()
// below would throw — confirm the option is always supplied.
String filterWords =  context.getConfiguration().get("Word.Name");

for (int i = 0; i < arrString.length; i++)
{
  String currentWord = arrString[i].toString();

  // Skip everything that is not exactly the configured word.
  if (!filterWords.equals(currentWord))
  {
    continue;
  }

  // Emit the matching word with a count of one.
  context.write(new Text(currentWord), new IntWritable(1));
}

Input

 -DWord.Name=Tests /home/turbo/workspace/MapReduce5/src/Inputs/Inputs[1-2] /home/turbo/workspace/MapReduce5/src/Outputs/

Word.Name is the parameter passed on the command line. Parameters passed with -D should always be given first, before the input and output paths.

The -D parameter is removed from the argument list by ToolRunner before the run() method is called, so args.length is 3 in main() and 2 in run().

Comments are closed.