Listing Files in Directory
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
public class MapReduceDriver extends Configured implements Tool
{
/**
 * Entry point: prints the paths obtained by listing {@code args[0]} and then
 * runs this driver through {@code ToolRunner}, exiting with its result code.
 *
 * @param args command-line arguments; {@code args[0]} is the path to list
 * @throws Exception propagated from the file-system calls or from the tool run
 */
public static void main(String[] args) throws Exception
{
    MapReduceDriver driver = new MapReduceDriver();
    Configuration conf = new Configuration();
    FileSystem fileSystem = FileSystem.get(conf);

    // List the given path and convert the statuses to plain paths for printing.
    FileStatus[] entries = fileSystem.listStatus(new Path(args[0]));
    Path[] listed = FileUtil.stat2Paths(entries);
    for (int i = 0; i < listed.length; i++)
    {
        String entryName = listed[i].toString();
        System.out.println(entryName);
    }

    // The process exit code is whatever the tool run returns.
    int exitCode = ToolRunner.run(driver, args);
    System.exit(exitCode);
}
// List the directory given as args[0] and use every entry as a job input path.
Path path = new Path(args[0]);
FileStatus[] status = fs.listStatus(path);
Path[] paths = FileUtil.stat2Paths(status);
// Accumulate ALL paths into one comma-separated string. Assigning csvPaths
// inside the loop from a single element would keep only the last path.
StringBuilder joinedPaths = new StringBuilder();
for (Path path2 : paths)
{
    if (joinedPaths.length() > 0)
    {
        joinedPaths.append(',');
    }
    joinedPaths.append(path2.toString());
}
csvPaths = joinedPaths.toString();
// A single call registers the whole comma-separated list as the job input.
FileInputFormat.setInputPaths(objJob, csvPaths);
Merging Files in a Folder
copyMerge – Parameters
- FileSystem Object
- Input Path
- FileSystem Object
- Output Path
- Delete Original File
- Configuration Object
- null
// Merge the files under the input path (args[0]) into the output path (args[2]).
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path inputPath = new Path(args[0]);
Path outPath = new Path(args[2]);
// The same FileSystem object serves as source and destination; 'false' is the
// "delete original file" flag, and the trailing argument is passed as null.
boolean merged = FileUtil.copyMerge(fs, inputPath, fs, outPath, false, conf, null);
if (merged)
{
    System.out.println("Merge Successful");
}
globStatus takes patterns
// Build a glob pattern below args[0] and let globStatus expand it.
Path path = new Path(args[0] + "/Inputs/Input*");
FileStatus[] status = fs.globStatus(path);
Merging Multiple Paths
import org.apache.commons.lang.StringUtils;
// Join all paths into one comma-separated string and register them in a single call.
// Calling setInputPaths once per path inside a loop would replace the previously
// set input each time, leaving only the last path as the job input.
csvPaths = StringUtils.join(paths, ",");
FileInputFormat.setInputPaths(objJob, csvPaths);
Passing Arguments in Command Context and Fetching It
// Word to keep, read from the job configuration under the key "Word.Name".
String filterWords = context.getConfiguration().get("Word.Name");
// NOTE(review): if "Word.Name" was not supplied, filterWords is presumably null
// and the equals() call below would throw — confirm against the caller.
for (int i = 0; i < arrString.length; i++)
{
    String token = arrString[i].toString();
    if (!filterWords.equals(token))
    {
        continue;
    }
    // Emit a count of one for every token that matches the filter word.
    context.write(new Text(token), new IntWritable(1));
}
Input
-DWord.Name=Tests /home/turbo/workspace/MapReduce5/src/Inputs/Inputs[1-2] /home/turbo/workspace/MapReduce5/src/Outputs/
Word.Name is the parameter passed on the command line (as -DWord.Name=<value>). Such parameters must always be passed first, before the input and output paths.
The -D parameter is consumed during the ToolRunner.run() call made from main() and is not forwarded to run(), so args.length is 3 in main() and 2 in run().