Apache Hive - Practically

Apache Hive is an open-source data warehouse system built on top of Hadoop. It is used for querying and analyzing large datasets stored in Hadoop files.
Initially, you had to write complex MapReduce jobs, but now, with the help of Hive, you merely need to submit SQL-like queries. Hive is mainly targeted towards users who are comfortable with SQL. Hive uses a language called HiveQL (HQL), which is similar to SQL. Hive automatically translates HiveQL queries into MapReduce jobs.
Hive abstracts away the complexity of Hadoop. The main thing to notice is that there is no need to learn Java for Hive.
Hive generally runs on your workstation and converts your SQL query into a series of jobs for execution on a Hadoop cluster. Apache Hive organizes data into tables. This provides a means of attaching structure to data stored in HDFS.
In Hive 0.14, new APIs were added to fully support the ACID properties when performing transactions.

Transactions are provided at the row level in Hive 0.14. The row-level operations available in Hive 0.14 are as follows:
Insert
Delete
Update
There are numerous limitations on the transactions available in Hive 0.14. ORC is the only file format supported for Hive transactions, so the ORC format is essential for performing transactions in Hive. The table also needs to be bucketed in order to support transactions, as sketched below.
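A minimal sketch of such a table (the table name, columns, and bucket count are illustrative, not from these notes); running UPDATE/DELETE against it also assumes the transaction manager (hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager) is enabled on the cluster:

CREATE TABLE EMP_ACID
(
EMPLOYEE_ID int,FIRST_NAME string,SALARY int
)
CLUSTERED BY (EMPLOYEE_ID) INTO 4 BUCKETS
STORED AS ORC
TBLPROPERTIES ('transactional'='true');

UPDATE EMP_ACID SET SALARY = SALARY + 1000
WHERE EMPLOYEE_ID = 100;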


Architecture:

[Figure: Hive Architecture]
Features of Apache Hive

There are many features of Apache Hive. Let's discuss them one by one:
  • Hive makes data summarization, querying, and analysis much easier.
  • Hive supports external tables, which make it possible to process data without pulling it into Hive's own warehouse storage (a sketch follows after this list).
  • Apache Hive fits the low-level interface requirements of Hadoop perfectly.
  • It also supports partitioning of data at the table level to improve performance.
  • Hive has a rule-based optimizer for optimizing logical plans.
  • It is scalable, familiar, and extensible.
  • Using HiveQL doesn't require any programming-language knowledge; basic SQL is enough.
  • We can easily process structured data in Hadoop using Hive.
  • Querying in Hive is very simple, as it is similar to SQL.
  • We can also run ad-hoc queries for data analysis using Hive.
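A minimal sketch of an external table (name and location illustrative): the data stays where it already is, and DROP TABLE removes only the metadata, not the files.

CREATE EXTERNAL TABLE EMP_EXT
(
EMPLOYEE_ID int,FIRST_NAME string,LAST_NAME string,SALARY int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LOCATION '/data/external/emp';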

 Limitations of Apache Hive

Hive has the following limitations:
  • Apache Hive does not offer real-time queries; row-level updates exist only through the ACID support described above.
  • Hive does not provide acceptable latency for interactive data browsing.
  • It is not suited for online transaction processing (OLTP).
  • Latency for Apache Hive queries is generally very high.


==================Practice================

Create a table using Hive
1. Prepare HDFS and start all daemons (note: 'hadoop namenode -format' erases HDFS and is only needed on first-time setup):
hadoop namenode -format
start-all.sh
jps

CREATE TABLE EMP
(
EMPLOYEE_ID int,FIRST_NAME string,LAST_NAME string,EMAIL string,PHONE_NUMBER string,HIRE_DATE string,JOB_ID string,SALARY int,MANAGER_ID int,DEPARTMENT_ID int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';

===========serde==========

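The DELIMITED shorthand above can instead be spelled out through an explicit SerDe; a minimal sketch using Hive's built-in OpenCSVSerde (table name illustrative; note that this SerDe treats every column as a string):

CREATE TABLE EMP_SERDE
(
EMPLOYEE_ID string,FIRST_NAME string,LAST_NAME string,EMAIL string,PHONE_NUMBER string,HIRE_DATE string,JOB_ID string,SALARY string,MANAGER_ID string,DEPARTMENT_ID string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ("separatorChar" = ",")
STORED AS TEXTFILE;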

LOAD DATA LOCAL INPATH '/home/nitin/Dataset/CSV/EMP.csv'
INTO TABLE EMP;

set hive.cli.print.header=true;

!hadoop fs -put /home/nitin/Dataset/CSV/EMP.csv /data/hive/emp.csv;

LOAD DATA INPATH '/data/hive/emp.csv'
INTO TABLE EMP;
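To sanity-check either load, run a quick query:

select * from EMP limit 10;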

===========location clause==========

!hadoop fs -put /home/nitin/Dataset/CSV/EMP.csv /data/hive/emp.csv;
!hadoop fs -put /home/nitin/Dataset/CSV/EMP.csv /data/hive/emp2.csv;

CREATE TABLE EMP_loc
(
EMPLOYEE_ID int,FIRST_NAME string,LAST_NAME string,EMAIL string,PHONE_NUMBER string,HIRE_DATE string,JOB_ID string,SALARY int,MANAGER_ID int,DEPARTMENT_ID int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
LOCATION '/data/hive';


LOAD DATA LOCAL INPATH '/home/itelligence/Dataset/CSV/EMP.csv' INTO TABLE EMP_LOC;


===========HIVE MANAGED TABLE==========



CREATE TABLE EMP
(
EMPLOYEE_ID int,FIRST_NAME string,LAST_NAME string,EMAIL string,PHONE_NUMBER string,HIRE_DATE string,JOB_ID string,SALARY int,MANAGER_ID int,DEPARTMENT_ID int
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';

LOAD DATA LOCAL INPATH '/home/nitin/Dataset/CSV/EMP.csv'
INTO TABLE EMP;
(with LOCAL, the file is COPIED from the local filesystem into the warehouse)

set hive.cli.print.header=true;

TRUNCATE TABLE EMP;

!hadoop fs -put /home/nitin/Dataset/CSV/EMP.csv /data/hive/emp.csv;

LOAD DATA INPATH '/data/hive/emp.csv'
INTO TABLE EMP;
(without LOCAL, the HDFS file is MOVED into the warehouse)
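Listing the HDFS source directory shows the difference: after LOAD DATA INPATH the source file has been moved into the warehouse, so it is no longer under /data/hive:

!hadoop fs -ls /data/hive;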


=====================

FIND OUT DEPARTMENT_WISE AVERAGE SALARY

select DEPARTMENT_ID,AVG(SALARY)
FROM EMP
GROUP BY DEPARTMENT_ID;

FIND OUT PEOPLE WHO HAVE FIRST NAME STARTING WITH A


SELECT *
FROM EMP
WHERE FIRST_NAME LIKE 'A%';

FIND OUT PEOPLE WHO HAVE SALARY GREATER THAN AVERAGE SALARY


Hive does not allow a scalar subquery directly in WHERE, so the one-row average is cross-joined in:

SELECT *
FROM EMP CROSS JOIN (SELECT AVG(SALARY) AS A FROM EMP) W
WHERE SALARY > A;
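The next query searches the ORD orders table, which these notes never define; a minimal sketch consistent with the columns used below (all column types are assumptions, kept as string since the values carry a leading '$'):

CREATE TABLE ORD
(
ORDERID string,ORDERAMOUNT string,ORDERTYPE string,PRODUCTDESCRIPTION string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';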


FIND OUT ORDERS WHERE A '$'-PREFIXED VALUE IN ANY COLUMN IS BELOW 200

SELECT *
FROM(
SELECT ORDERID,ORDERAMOUNT AS A
FROM ORD
WHERE ORDERAMOUNT LIKE '$%'
UNION ALL
SELECT ORDERID,ORDERTYPE AS A
FROM ORD
WHERE ORDERTYPE LIKE '$%'
UNION ALL
SELECT ORDERID,PRODUCTDESCRIPTION AS A
FROM ORD
WHERE PRODUCTDESCRIPTION LIKE '$%')Q
WHERE CAST(SUBSTRING(A,2,LENGTH(A)-1) AS INT) < 200;
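The word-count example below reads from a wc table that these notes never create; a minimal sketch, where the single word column holds a whole line of text (the input path is hypothetical):

CREATE TABLE wc (word string);

LOAD DATA LOCAL INPATH '/home/nitin/Dataset/wordcount.txt'
INTO TABLE wc;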

CREATE TABLE T1
AS
select EXPLODE(SPLIT(word," ")) AS A
from wc;

select A,COUNT(*)
FROM T1
GROUP BY A;

insert overwrite directory '/DATA/H2' SELECT * FROM WC;

hive -f <hql_script_path>
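For example (the script path is hypothetical); hive -e runs a single query inline instead of a script file:

hive -f /home/nitin/scripts/wordcount.hql
hive -e 'select A,COUNT(*) from T1 group by A;'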


========Partitioning and Bucketing ======

============DYNAMIC PARTITIONING===========
--STEPS--

STEP 1: CREATE EMP TABLE

STEP 2: LOAD DATA IN EMP TABLE

STEP 3: CREATE EMP PART TABLE(PARTITIONED)

STEP 4: EMP-->EMP_PART

==================================================

STEP 1: CREATE EMP TABLE

CREATE TABLE EMP
(
EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT,DEPARTMENT_ID INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';
-------------------
STEP 2: LOAD DATA IN EMP TABLE

LOAD DATA LOCAL INPATH '/home/nitin/Dataset/CSV/EMP.csv'
INTO TABLE EMP;

STEP 3: CREATE EMP PART TABLE(PARTITIONED)

CREATE TABLE EMP_PART
(
EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT
)
PARTITIONED BY (DEPARTMENT_ID INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';

STEP 4: EMP-->EMP_PART

Dynamic partitioning must be enabled before the insert:

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

INSERT INTO TABLE EMP_PART PARTITION(DEPARTMENT_ID)
SELECT * FROM EMP;

EMP(17)
EMP_PART(1)

===========MULTIPLE COLUMNS============

CREATE TABLE EMP_PART2
(
EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT
)
PARTITIONED BY (MANAGER_ID INT,DEPARTMENT_ID INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';

INSERT OVERWRITE TABLE EMP_PART2 PARTITION(MANAGER_ID,DEPARTMENT_ID)
SELECT * FROM EMP;
===========BUCKETING============
set hive.enforce.bucketing=true;

CREATE TABLE EMP_BUCKETED
(
EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT,DEPARTMENT_ID INT
)
CLUSTERED BY (DEPARTMENT_ID) INTO 3 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';

INSERT INTO TABLE EMP_BUCKETED
select * from emp;
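Bucket sampling is one way to see the buckets at work; this reads roughly one of the three buckets:

SELECT * FROM EMP_BUCKETED TABLESAMPLE(BUCKET 1 OUT OF 3 ON DEPARTMENT_ID);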



CREATE TABLE EMP
(
EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID  STRING,SALARY INT,MANAGER_ID INT,DEPARTMENT_ID INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';

load data local inpath '/home/nitin/Dataset/CSV/EMP.csv'
into table emp;
--------------------------------------------------------
CREATE TABLE EMP_PB
(
EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID  STRING,SALARY INT,MANAGER_ID INT
)
PARTITIONED BY (DEPARTMENT_ID INT)
CLUSTERED BY (MANAGER_ID) INTO 3 BUCKETS
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';

INSERT OVERWRITE TABLE EMP_PB PARTITION(DEPARTMENT_ID)
SELECT * FROM EMP;


=============FILE FORMAT=================

CREATE TABLE EMP_SQ
(
EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID  STRING,SALARY INT,MANAGER_ID INT,DEPARTMENT_ID INT
)
STORED AS SEQUENCEFILE;

INSERT OVERWRITE TABLE EMP_SQ
SELECT * FROM EMP;
===================================

CREATE TABLE EMP_RC
(
EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID  STRING,SALARY INT,MANAGER_ID INT,DEPARTMENT_ID INT
)
STORED AS RCFILE;

INSERT OVERWRITE TABLE EMP_RC
SELECT * FROM EMP;
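The same pattern works for ORC (available since Hive 0.11), the format required for the ACID transactions described at the top of these notes (table name illustrative):

CREATE TABLE EMP_ORC
(
EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT,DEPARTMENT_ID INT
)
STORED AS ORC;

INSERT OVERWRITE TABLE EMP_ORC
SELECT * FROM EMP;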

===================================
EMP.csv --> EMP(TABLE) --> EMP_AVRO(TABLE)

1. ANALYSE SCHEMA OF EMP
2. CREATE YOUR SCHEMA FILE IN JSON FORMAT AND STORE IT ON HDFS(/data/emp_avro.json)
3. CREATE EMP TABLE AND LOAD EMP.csv in it
4. CREATE EMP_AVRO TABLE AND PASS SCHEMA FILE AS PARAMETER
5. LOAD DATA FROM EMP TABLE INTO EMP_AVRO TABLE

AVRO TABLE
(
AVRO SERDE CLASS PATH,
INPUT READER CLASS,
OUTPUT WRITER CLASS,
SCHEMA FILE
)
==================================================

{
 "namespace": "default",
 "type":"record",
 "name":"emp_avro",
 "fields": [
   {"name":"EMPLOYEE_ID","type":["int","null"]},
   {"name":"FIRST_NAME","type":"string"},
   {"name":"LAST_NAME","type":"string"},
   {"name":"EMAIL","type":"string"},
   {"name":"PHONE_NUMBER","type":"string"},
   {"name":"HIRE_DATE","type":"string"},
   {"name":"JOB_ID","type":"string"},
   {"name":"SALARY","type":["int","null"]},
   {"name":"MANAGER_ID","type":["int","null"]},
   {"name":"DEPARTMENT_ID","type":["int","null"]}
 ]
}
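Step 2 above puts this schema file on HDFS; assuming the JSON was saved locally first (the local path is hypothetical):

!hadoop fs -put /home/nitin/emp_avro.json /data/emp_avro.json;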

-------------------------------------------

CREATE TABLE emp_AVRO
 ROW FORMAT SERDE
 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
 STORED AS INPUTFORMAT
 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
 OUTPUTFORMAT
 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
 TBLPROPERTIES (
   'avro.schema.url'='hdfs://localhost:8020/data/emp_avro.json');
 
--------------------------------------------------
INSERT INTO TABLE EMP_AVRO
SELECT * FROM EMP;


==================UDFS IN HIVE======================
package myudfs;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

// Hive UDF that upper-cases a string column.
// The class must be public so Hive can load it by name.
public class ToUpper extends UDF {
  public Text evaluate(Text a) {
    if (a == null) return null;   // pass NULLs through
    return new Text(a.toString().toUpperCase());
  }
}
CREATE JAVA PROJ ---> ADD JARS(HADOOP,HADOOP LIB,HIVE LIB) ---> CREATE PACKAGE ---> CREATE CLASS ---> WRITE PROGRAM ---> EXPORT THE JAR FILE

ADD JAR /home/nitin/Desktop/hive_udf_36.jar;

CREATE TEMPORARY FUNCTION UP as 'myudfs.ToUpper';

select UP(FIRST_NAME)
from emp;


========static partitioning=========
CREATE TABLE EMP_STATIC
(
EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT
)
PARTITIONED BY (DEPARTMENT_ID INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
lines terminated by '\n';

LOAD DATA LOCAL INPATH '/home/nitin/Dataset/CSV/50.csv'
INTO TABLE EMP_STATIC PARTITION(DEPARTMENT_ID=50);
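Querying by the partition key then touches only that partition's directory:

SELECT * FROM EMP_STATIC WHERE DEPARTMENT_ID=50;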

=======================TAX CALCULATION UDF==============
package myudfs;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

// Hive UDF that computes income tax from a salary passed in as text.
// Slabs: nil up to 250000, 10% on 250000-500000,
// 20% on 500000-1000000, 30% above 1000000.
public class TaxCalc extends UDF {
  public Text evaluate(Text a) {
    if (a == null) return null;              // check for NULL before parsing
    int x = Integer.parseInt(a.toString());
    double tax;
    if (x <= 250000)
      tax = 0;
    else if (x <= 500000)
      tax = (x - 250000) * 0.1;
    else if (x <= 1000000)
      tax = 25000 + (x - 500000) * 0.2;
    else
      tax = 25000 + 100000 + (x - 1000000) * 0.3;
    return new Text(tax + "");
  }
}
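Registering and calling the tax UDF mirrors the ToUpper example above (the jar path is hypothetical); the UDF takes text, so the integer salary is cast:

ADD JAR /home/nitin/Desktop/hive_udf_tax.jar;

CREATE TEMPORARY FUNCTION tax as 'myudfs.TaxCalc';

select FIRST_NAME,tax(CAST(SALARY AS string))
from emp;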
LOAD DATA LOCAL INPATH '/home/itelligence/Dataset/CSV/orders.csv'
INTO TABLE ORD;



