package com.mugil.pig;

import java.io.IOException;

import org.apache.pig.FilterFunc;
import org.apache.pig.data.Tuple;

public class FilterType  extends FilterFunc {
	
	@Override
	public Boolean exec(Tuple tuple) throws IOException {
		
		if(tuple == null || tuple.size() == 0)		
		 return false;
		
		try {
			Object obj = tuple.get(0);
			
			if(obj == null)			
			 return false;
			
			String Type = (String)obj;
			
			if(Type.equals("Kitchen"))
			 return true;
			
		} catch (Exception e) {
			throw new IOException("Caught exception processing input row " + e.getMessage(), e);
		}
			
		return false;
	}
}

Registering UDF Function

grunt> REGISTER  /usr/local/pig-0.15.0/FilterByType3.jar;                  
grunt> DEFINE FilterType com.mugil.pig.FilterType();         
grunt> filtered_records = FILTER records BY FilterType(Type);
grunt> DUMP filtered_records;
Posted in Pig.

Loading CSV File using Pig Script

A = LOAD '...' USING PigStorage(',') AS (...);

Filtering NULL values in chararray
Example1: null as chararray
input.txt

1,2014-04-08 12:09:23.0
2,2014-04-08 12:09:23.0
3,null
4,null

Pig:

A = LOAD 'input.txt' USING PigStorage(',') AS (f1:int,f2:chararray);
B = FILTER A BY f2!='null';
DUMP B;

Example2: Real null value
input.txt

1,2014-04-08 12:09:23.0
2,2014-04-08 12:09:23.0
3,
4,

Pig:

A = LOAD 'input.txt' USING PigStorage(',') AS (f1:int,f2:chararray);
B = FILTER A BY f2 is not null;
DUMP B;

Output:

(1,2014-04-08 12:09:23.0)
(2,2014-04-08 12:09:23.0)

Finding Max from CSV File

Test.csv

Maruthi,10
Maruthi,55
Suziki,50
Honda,4
Maruthi,40
Suziki,60
Honda,14
BMW,140
Benz,5
a1 = LOAD 'Test.csv' USING PigStorage(',') AS (Car:chararray, No:int);
DESCRIBE a1;

Output

 a1: {Car: chararray,No: int}
b1 = GROUP a1 BY Car;
DESCRIBE b1;
 b1: {group: chararray,a1: {(Car: chararray,No: int)}}
DUMP b1;

Output

(BMW,{(BMW,140)})
(Benz,{(Benz,5)})
(Honda,{(Honda,4),(Honda,14)})
(Suziki,{(Suziki,50),(Suziki,60)})
(Maruthi,{(Maruthi,10),(Maruthi,55),(Maruthi,40)})
(,{(,)})
 c1 = FOREACH b1 GENERATE group, MAX(a1.No);

Output

(BMW,140)
(Benz,5)
(Honda,14)
(Suziki,60)
(Maruthi,55)

Filtering Empty Records
Corrpted Record tsv content

HouseHold,Soap,2
Kitchen,Oil,2
HouseHold,Sweeper,2
PoojaItems,Sandal
Kitchen,Rice,30
HouseHold,,1
Kitchen,Sugar,5
HouseHold,Shampoo,2
PoojaItems,Champor,10
HouseHold,Soap,2
filtered_records = FILTER records BY Item is null OR No is null;

Getting Count of Corrupted Records

records = LOAD 'Test.csv' USING PigStorage(',') AS (Type:chararray, Item:chararray, No:int);
filtered_records = FILTER records BY Item is null OR No is null;
grouped_records = GROUP filtered_records BY Type;
DESCRIBE grouped_records;
 grouped_records: {group: chararray,filtered_records: {(Type: chararray,Item: chararray,No: int)}} 
corrupt_records = FOREACH grouped_records GENERATE group , COUNT(filtered_records);
(HouseHold,1)
(PoojaItems,1)

Writing macros to find Maximum Item Sold

DEFINE max_item_sold(Records, Type, No) RETURNS c 
{ 
 b = GROUP $Records BY $Type;                        
 $c = FOREACH b GENERATE group, MAX($Records.$No);  
}; 
 max_type_sold = max_item_sold(records, Type, No);
Posted in Pig.