Hive Architecture
Features of Apache Hive
There are so many features of Apache Hive. Let’s discuss them one by one-
- Hive provides data summarization, query, and analysis in much easier manner.
- Hive supports external tables which make it possible to process data without actually storing in HDFS.
- Apache Hive fits the low-level interface requirement of Hadoop perfectly.
- It also supports partitioning of data at the level of tables to improve performance.
- Hive has a rule based optimizer for optimizing logical plans.
- It is scalable, familiar, and extensible.
- Using HiveQL doesn’t require any knowledge of programming language, Knowledge of basic SQL query is enough.
- We can easily process structured data in Hadoop using Hive.
- Querying in Hive is very simple as it is similar to SQL.
- We can also run Ad-hoc queries for the data analysis using Hive.
Limitation of Apache Hive
Hive has the following limitations-
- Apache Hive does not offer real-time queries or row-level updates.
- Hive does not provide acceptable latency for interactive data browsing.
- It is not good for online transaction processing.
- Latency for Apache Hive queries is generally very high.
==================Practice================
Create Table using hive
1.- start all daemons using following three steps
# Format the HDFS NameNode (wipes existing HDFS metadata — only for a fresh setup).
hadoop namenode -format
# Start all Hadoop daemons (HDFS + YARN/MapReduce).
start-all.sh
# List running JVM processes to verify the daemons came up.
jps
-- Employee table backed by a comma-delimited text file.
CREATE TABLE EMP
(
    EMPLOYEE_ID   INT,
    FIRST_NAME    STRING,
    LAST_NAME     STRING,
    EMAIL         STRING,
    PHONE_NUMBER  STRING,
    HIRE_DATE     STRING,
    JOB_ID        STRING,
    SALARY        INT,
    MANAGER_ID    INT,
    DEPARTMENT_ID INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';
SerDe (Serializer/Deserializer) — the same EMP table definition written as a single statement:
-- Same EMP definition, one column per line for readability.
CREATE TABLE EMP
(
    EMPLOYEE_ID   INT,
    FIRST_NAME    STRING,
    LAST_NAME     STRING,
    EMAIL         STRING,
    PHONE_NUMBER  STRING,
    HIRE_DATE     STRING,
    JOB_ID        STRING,
    SALARY        INT,
    MANAGER_ID    INT,
    DEPARTMENT_ID INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';
-- LOCAL load: the file is COPIED from the local filesystem into the table's warehouse directory.
LOAD DATA LOCAL INPATH'/home/nitin/Dataset/CSV/EMP.csv' INTO TABLE EMP;
-- Show column headers in Hive CLI query output.
set hive.cli.print.header=true;
-- "!" runs a shell command from inside the Hive CLI; stage the file on HDFS first.
!hadoop fs -put /home/nitin/Dataset/CSV/EMP.csv /data/hive/emp.csv;
-- HDFS load (no LOCAL): the file is MOVED into the table's warehouse directory.
LOAD DATA INPATH'/data/hive/emp.csv' INTO TABLE EMP;
===========location clause==========
-- Stage two copies of the data under /data/hive on HDFS.
!hadoop fs -put /home/nitin/Dataset/CSV/EMP.csv /data/hive/emp.csv; !hadoop fs -put /home/nitin/Dataset/CSV/EMP.csv /data/hive/emp2.csv;
-- LOCATION points the table at an existing HDFS directory instead of the default warehouse path,
-- so the files already staged under /data/hive become the table's data.
CREATE TABLE EMP_loc ( EMPLOYEE_ID int,FIRST_NAME string,LAST_NAME string,EMAIL string,PHONE_NUMBER string,HIRE_DATE string,JOB_ID string,SALARY int,MANAGER_ID int,DEPARTMENT_ID int ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION'/data/hive';
-- NOTE(review): this path uses /home/itelligence while the rest of the file uses /home/nitin — confirm.
LOAD DATA LOCAL INPATH'/home/itelligence/Dataset/CSV/EMP.csv' INTO TABLE EMP_LOC;
# Restart sequence (same as at the top of the practice section).
hadoop namenode -format
start-all.sh
jps
===========HIVE MANAGED TABLE==========
-- Managed (internal) table: Hive owns the data under its warehouse directory.
CREATE TABLE EMP
(
    EMPLOYEE_ID   INT,
    FIRST_NAME    STRING,
    LAST_NAME     STRING,
    EMAIL         STRING,
    PHONE_NUMBER  STRING,
    HIRE_DATE     STRING,
    JOB_ID        STRING,
    SALARY        INT,
    MANAGER_ID    INT,
    DEPARTMENT_ID INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';
-- LOCAL load: the source file is COPIED (the "(COPY)" annotation was a bare token
-- after the semicolon in the original, which is not valid HiveQL).
LOAD DATA LOCAL INPATH '/home/nitin/Dataset/CSV/EMP.csv' INTO TABLE EMP;
set hive.cli.print.header=true;
-- Remove all rows; the table definition is kept.
TRUNCATE TABLE EMP;
-- Stage the file on HDFS. The original command was missing the destination path,
-- which the LOAD below depends on (compare the earlier !hadoop fs -put command).
!hadoop fs -put /home/nitin/Dataset/CSV/EMP.csv /data/hive/emp.csv;
-- HDFS load: the source file is MOVED into the warehouse directory.
LOAD DATA INPATH '/data/hive/emp.csv' INTO TABLE EMP;
===========location clause==========
-- Stage two copies of the data under /data/hive on HDFS.
!hadoop fs -put /home/nitin/Dataset/CSV/EMP.csv /data/hive/emp.csv; !hadoop fs -put /home/nitin/Dataset/CSV/EMP.csv /data/hive/emp2.csv;
-- External-style placement: LOCATION ties the table to /data/hive, so both staged files are visible to it.
CREATE TABLE EMP_loc ( EMPLOYEE_ID int,FIRST_NAME string,LAST_NAME string,EMAIL string,PHONE_NUMBER string,HIRE_DATE string,JOB_ID string,SALARY int,MANAGER_ID int,DEPARTMENT_ID int ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' LOCATION'/data/hive';
-- LOCAL load copies a third file into the same /data/hive directory.
LOAD DATA LOCAL INPATH'/home/nitin/Dataset/CSV/EMP.csv' INTO TABLE EMP_LOC;
=====================
FIND OUT DEPARTMENT_WISE AVERAGE SALARY
FIND OUT PEOPLE WHO HAVE FIRST NAME STARTING WITH A
FIND OUT DEPARTMENT_WISE AVERAGE SALARY
-- One row per department with its average salary.
select DEPARTMENT_ID,AVG(SALARY)
FROM EMP
GROUP BY DEPARTMENT_ID;
FIND OUT PEOPLE WHO HAVE FIRST NAME STARTING WITH A
-- LIKE 'A%': first name begins with a capital A (LIKE is case-sensitive).
SELECT *
FROM EMP
WHERE FIRST_NAME LIKE 'A%';
-- FIND OUT PEOPLE WHO HAVE SALARY GREATER THAN AVERAGE SALARY
-- The original used FULL OUTER JOIN with no ON clause; joining every row of EMP
-- to the single-row average subquery is a CROSS JOIN, which is the valid form.
SELECT *
FROM EMP
CROSS JOIN (SELECT AVG(SALARY) AS A FROM EMP) W
WHERE SALARY > A;
-- Rows from ORD where any of three columns holds a '$'-prefixed value under 200.
SELECT *
FROM (
    select orderid,orderamount AS A
    from ord
    where orderamount LIKE '$%'
    UNION ALL
    SELECT ORDERID,ORDERTYPE AS A
    FROM ORD
    WHERE ORDERTYPE LIKE '$%'
    UNION ALL
    SELECT ORDERID,PRODUCTDESCRIPTION AS A
    FROM ORD
    WHERE PRODUCTDESCRIPTION LIKE '$%') Q
-- Strip the leading '$' and compare numerically. The original relied on an
-- implicit string-to-number cast against 200 and passed a redundant length
-- argument: SUBSTRING(A,2,LENGTH(A)-1) is simply SUBSTRING(A,2).
WHERE CAST(SUBSTRING(A,2) AS DOUBLE) < 200;
-- Word count: SPLIT tokenizes each line on spaces; EXPLODE emits one row per token.
CREATE TABLE T1 AS select EXPLODE(SPLIT(word," ")) AS A from wc;
select A,COUNT(*) FROM T1 GROUP BY A;
-- (Duplicate of the two statements above; the CREATE will fail if T1 already exists.)
CREATE TABLE T1 AS select EXPLODE(SPLIT(word," ")) AS A from wc;
select A,COUNT(*) FROM T1 GROUP BY A;
-- Write query results out to an HDFS directory instead of the console.
insert overwrite directory '/DATA/H2' SELECT * FROM WC;
-- Run an HQL script file non-interactively from the shell.
hive -f <hql_script_path>
========Partitioning and Bucketing ======
============DYNAMIC part=========== --STEPS--
STEP 1: CREATE EMP TABLE
STEP 2: LOAD DATA IN EMP TABLE
STEP 3: CREATE EMP PART TABLE(PARTITIONED)
STEP 4: EMP-->EMP_PART
==================================================
STEP 1: CREATE EMP TABLE
CREATE TABLE EMP ( EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT,DEPARTMENT_ID INT ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'; ------------------- STEP 2: LOAD DATA IN EMP TABLE
LOAD DATA LOCAL INPATH'/home/nitin/Dataset/CSV/EMP.csv' INTO TABLE EMP;
STEP 3: CREATE EMP PART TABLE(PARTITIONED)
-- The partition column (DEPARTMENT_ID) is NOT in the column list; it is declared in PARTITIONED BY.
CREATE TABLE EMP_PART ( EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT ) PARTITIONED BY (DEPARTMENT_ID INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
STEP 4: EMP-->EMP_PART
-- Dynamic-partition insert: DEPARTMENT_ID is the last column of EMP's SELECT *, matching the partition spec.
-- NOTE(review): the SET ... nonstrict two lines below must be executed BEFORE this INSERT, not after.
INSERT INTO TABLE EMP_PART PARTITION(DEPARTMENT_ID) SELECT * FROM EMP;
EMP(17) EMP_PART(1)
set hive.exec.dynamic.partition.mode=nonstrict;
===========MULTIPLE COLUMNS============
-- Two partition columns: directories are nested as MANAGER_ID=<v>/DEPARTMENT_ID=<v>.
CREATE TABLE EMP_PART2 ( EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT ) PARTITIONED BY (MANAGER_ID INT,DEPARTMENT_ID INT) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
INSERT OVERWRITE TABLE EMP_PART2 PARTITION(MANAGER_ID,DEPARTMENT_ID) SELECT * FROM EMP;
-- NOTE(review): this INSERT appears BEFORE EMP_BUCKETED is created two lines below — out of order in the transcript.
INSERT INTO TABLE EMP_BUCKETED select * from emp;
===========BUCKETING============ set hive.enforce.bucketing=true;
-- Rows are hashed on DEPARTMENT_ID into 3 bucket files.
CREATE TABLE EMP_BUCKETED ( EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT,DEPARTMENT_ID INT ) CLUSTERED BY (DEPARTMENT_ID) INTO 3 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
INSERT INTO TABLE EMP_BUCKETED select * from emp;
CREATE TABLE EMP ( EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT,DEPARTMENT_ID INT ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
-- Combined partitioning + bucketing: partition on DEPARTMENT_ID, then bucket each partition on MANAGER_ID.
load data local inpath'/home/nitin/Dataset/CSV/EMP.csv' into table emp; -------------------------------------------------------- CREATE TABLE EMP_PB ( EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT ) PARTITIONED BY (DEPARTMENT_ID INT) CLUSTERED BY (MANAGER_ID) INTO 3 BUCKETS ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n';
INSERT OVERWRITE TABLE EMP_PB PARTITION(DEPARTMENT_ID) SELECT * FROM EMP;
=============FILE FORMAT=================
-- Same schema stored as a binary SequenceFile instead of delimited text.
CREATE TABLE EMP_SQ ( EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT,DEPARTMENT_ID INT ) STORED AS SEQUENCEFILE;
INSERT OVERWRITE TABLE EMP_SQ SELECT * FROM EMP; ===================================
-- Same schema stored as columnar RCFile.
CREATE TABLE EMP_RC ( EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT,DEPARTMENT_ID INT ) STORED AS RCFILE;
INSERT OVERWRITE TABLE EMP_RC SELECT * FROM EMP;
===================================
EMP.csv --> EMP(TABLE) --> EMP_AVRO(TABLE)
1. ANALYSE SCHEMA OF EMP
2. CREATE YOUR SCHEMA FILE IN JSON FORMAT AND STORE IT ON HDFS(/data/emp_avro.json)
3. CREATE EMP TABLE AND LOAD EMP.csv in it
4. CREATE EMP_AVRO TABLE AND PASS SCHEMA FILE AS PARAMETER
5. LOAD DATA FROM EMP TABLE INTO EMP_AVRO TABLE
AVRO TABLE
(
AVRO SERDE CLASS PATH,
INPUT READER CLASS,
OUTPUT WRITER CLASS,
SCHEMA FILE
)
==================================================
{
"namespace": "default",
"type":"record",
"name":"emp_avro",
"fields": [
{"name":"EMPLOYEE_ID","type":["int","null"]},
{"name":"FIRST_NAME","type":"string"},
{"name":"LAST_NAME","type":"string"},
{"name":"EMAIL","type":"string"},
{"name":"PHONE_NUMBER","type":"string"},
{"name":"HIRE_DATE","type":"string"},
{"name":"JOB_ID","type":"string"},
{"name":"SALARY","type":["int","null"]},
{"name":"MANAGER_ID","type":["int","null"]},
{"name":"DEPARTMENT_ID","type":["int","null"]}
]
}
-------------------------------------------
-- Avro-backed table: no column list is given — the columns are derived from the
-- Avro schema file referenced in TBLPROPERTIES (the JSON shown above).
CREATE TABLE emp_AVRO
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
TBLPROPERTIES (
'avro.schema.url'='hdfs://localhost:8020/data/emp_avro.json');
--------------------------------------------------
-- Copy the text-table rows into the Avro table (step 5 of the plan above).
INSERT INTO TABLE EMP_AVRO
SELECT * FROM EMP;
==================UDFS IN HIVE======================
package myudfs;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * Hive UDF that upper-cases a text value.
 *
 * The class must be {@code public}: Hive loads and instantiates UDF classes by
 * reflection when they are registered via CREATE TEMPORARY FUNCTION, and the
 * original package-private declaration would not be accessible.
 */
public class ToUpper extends UDF {
    /**
     * @param a input text; may be null
     * @return upper-cased copy of {@code a}, or null when the input is null
     */
    public Text evaluate(Text a) {
        if (a == null) {
            return null;
        }
        return new Text(a.toString().toUpperCase());
    }
}
CREATE JAVA PROJ ---> ADD JARS(HADOOP,HADOOP LIB,HIVE LIB) ---> CREATE PACKAGE ---> CREATE CLASS ---> WRITE PROGRAM ---> EXPORT THE JAR FILE
-- Make the exported UDF jar available to the Hive session.
ADD JAR /home/nitin/Desktop/hive_udf_36.jar;
-- Register the UDF class under the SQL name UP.
CREATE TEMPORARY FUNCTION UP as 'myudfs.ToUpper';
-- The EMP table has no "name" column (see its schema above); FIRST_NAME is the
-- column the original presumably intended.
select UP(FIRST_NAME)
from emp;
========static partitioning=========
-- Static partitioning: the partition value is fixed in the LOAD statement's
-- PARTITION clause rather than derived from the data.
CREATE TABLE EMP_STATIC
(
EMPLOYEE_ID INT,FIRST_NAME STRING,LAST_NAME STRING,EMAIL STRING,PHONE_NUMBER STRING,HIRE_DATE STRING,JOB_ID STRING,SALARY INT,MANAGER_ID INT
)
PARTITIONED BY (DEPARTMENT_ID INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
lines terminated by '\n';
-- All rows in 50.csv land in the DEPARTMENT_ID=50 partition; presumably the file
-- holds department-50 rows without a DEPARTMENT_ID column — confirm against the file.
LOAD DATA LOCAL INPATH'/home/nitin/Dataset/CSV/50.csv'
INTO TABLE EMP_STATIC PARTITION(DEPARTMENT_ID=50);
=======================tax cal==============
package myudfs;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

/**
 * Hive UDF computing income tax by slab on a salary passed as text:
 * 0 up to 250000; 10% on the amount over 250000 up to 500000;
 * 25000 + 20% of the amount over 500000 up to 1000000;
 * 125000 + 30% of the amount over 1000000.
 *
 * NOTE(review): the class name TOUPPER is kept so existing registrations still
 * resolve, although it no longer matches the behavior.
 */
public class TOUPPER extends UDF {
    /**
     * @param a salary as text; may be null
     * @return computed tax rendered as text, or null when the input is null
     */
    public Text evaluate(Text a) {
        // Null check must come BEFORE parsing: the original called
        // Integer.parseInt(a.toString()) first, which throws a
        // NullPointerException on null input before the null check could run.
        if (a == null) {
            return null;
        }
        int x = Integer.parseInt(a.toString());
        double tax;
        if (x <= 250000) {
            tax = 0;
        } else if (x <= 500000) {
            tax = (x - 250000) * 0.1;
        } else if (x <= 1000000) {
            tax = 25000 + (x - 500000) * 0.2;
        } else {
            tax = 25000 + 100000 + (x - 1000000) * 0.3;
        }
        return new Text(tax + "");
    }
}
-- Load the orders file used by the ORD queries above.
-- NOTE(review): path uses /home/itelligence while most commands use /home/nitin — confirm.
LOAD DATA LOCAL INPATH'/home/itelligence/Dataset/CSV/orders.csv' INTO TABLE ORD;
|
No comments:
Post a Comment