|
|
|
|
|
|
|
# 配置信息:
|
|
|
|
|
|
|
|
## 使用gdpi.sada_gdpi_click库时必须加上这段权限
|
|
|
|
|
|
|
|
-- Credential required whenever the gdpi.sada_gdpi_click table is read
-- (see heading above); without it queries on that table fail.
set /user/gdpi/public/sada_gdpi_click.password=GWDPI-SH;
|
|
|
|
|
|
|
|
## 解决分片数过多问题
|
|
|
|
|
|
|
|
-- Raise the cap on block locations per split so jobs over heavily
-- partitioned/fragmented input do not fail with
-- "max block location exceeded for split".
set mapreduce.job.max.split.locations = 10000;
|
|
|
|
|
|
|
|
## 配置reduce数量
|
|
|
|
|
|
|
|
-- Fix the number of reduce tasks for the job.
-- NOTE: mapred.reduce.tasks is the deprecated MRv1 name; the MRv2
-- property mapreduce.job.reduces has identical semantics and avoids
-- deprecation warnings on Hadoop 2.x clusters.
SET mapreduce.job.reduces = 80;
|
|
|
|
|
|
|
|
## 配置使用内存
|
|
|
|
|
|
|
|
-- YARN container size (MB) for each map task. The map JVM heap
-- (mapreduce.map.java.opts -Xmx) must stay below this value, ~80%
-- of the container is the usual rule of thumb.
SET mapreduce.map.memory.mb=3048;
|
|
|
|
-- YARN container size (MB) for each reduce task. The reduce JVM heap
-- (mapreduce.reduce.java.opts -Xmx) must stay below this value.
SET mapreduce.reduce.memory.mb=7000;
|
|
|
|
|
|
|
|
## 配置Java虚拟机内存
|
|
|
|
|
|
|
|
-- Map-task JVM heap. Must fit inside the 3048 MB map container
-- (mapreduce.map.memory.mb); the original -Xmx6096m exceeded the
-- container, so YARN would kill map attempts for running over their
-- memory limit. ~80% of the container size is the standard setting.
SET mapreduce.map.java.opts='-Xmx2458m';
|
|
|
|
-- Reduce-task JVM heap. Must fit inside the 7000 MB reduce container
-- (mapreduce.reduce.memory.mb); the original -Xmx10000m exceeded the
-- container, so YARN would kill reduce attempts for running over their
-- memory limit. ~80% of the container size is the standard setting.
SET mapreduce.reduce.java.opts='-Xmx5600m';
|
|
|
|
|
|
|
|
## 配置mapjoin
|
|
|
|
|
|
|
|
-- Disable automatic conversion of common joins to map joins; joins
-- run as common (shuffle) joins unless explicitly hinted.
SET hive.auto.convert.join=false;
|
|
|
|
-- Honor explicit /*+ MAPJOIN(t) */ hints in queries (false = do NOT
-- ignore the hint).
SET hive.ignore.mapjoin.hint=false;
|
|
|
|
|
|
|
|
## 配置是否使用压缩
|
|
|
|
|
|
|
|
-- Compress the final query/job output files.
SET hive.exec.compress.output=true;
|
|
|
|
|
|
|
|
## 配置压缩格式(参考HIVE配置)
|
|
|
|
|
|
|
|
-- Snappy-compress job output (low CPU cost, fast decompression).
-- NOTE: mapred.output.compression.codec is the deprecated MRv1 name;
-- the MRv2 equivalent is
-- mapreduce.output.fileoutputformat.compress.codec.
SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
|
|
|
|
|
|
|
|
## 加载自定义UDF
|
|
|
|
|
|
|
|
-- Register the custom-UDF jar with the session; ${udf_path} is
-- substituted externally (e.g. via --hivevar/--define).
ADD JAR ${udf_path};
|
|
|
|
|
|
|
|
## 配置自定义UDF函数(参考自定义UDF库)
|
|
|
|
|
|
|
|
-- Bind the UDF class from the registered jar to a callable name.
-- TEMPORARY: the function exists only for this session.
-- NOTE(review): presumably extracts a device type from a user-agent
-- string — confirm against udf.tag.ua.DeviceTypeUDF.
CREATE TEMPORARY FUNCTION my_dev AS 'udf.tag.ua.DeviceTypeUDF';
|
|
|
|
|
|
|
|
hive内置的udf参见:<https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF>
|
|
|
|
|
|
|
|
# 基本操作命令:
|
|
|
|
|
|
|
|
## 进入hive环境
|
|
|
|
|
|
|
|
-- Launch the Hive CLI from the shell prompt. NOTE(review): the
-- trailing ';' is not needed for a shell command — it is harmless but
-- likely a copy-paste habit from HiveQL.
hive;
|
|
|
|
|
|
|
|
## 创建数据仓库
|
|
|
|
|
|
|
|
-- Create the warehouse database with bookkeeping properties.
-- IF NOT EXISTS makes the statement safe to re-run (idempotent),
-- instead of erroring when the database already exists.
create database IF NOT EXISTS e_pingan WITH DBPROPERTIES ('creator' = 'yangbl', 'date' = '20150430');
|
|
|
|
|
|
|
|
## 查看数据仓库
|
|
|
|
|
|
|
|
-- List all databases visible to the session.
SHOW DATABASES;
|
|
|
|
|
|
|
|
## 进入数据库
|
|
|
|
|
|
|
|
-- Switch the session's current database to e_pingan.
use e_pingan;
|
|
|
|
|
|
|
|
## 查看数据表
|
|
|
|
|
|
|
|
-- List tables in the current database.
show tables;
|
|
|
|
|
|
|
|
## 生成可用数据表
|
|
|
|
|
|
|
|
-- Register the jar containing the custom UDF classes used below.
add jar /home/xuji/yang/hive/udf.jar;
|
|
|
|
|
|
|
|
-- Session-scoped UDF. NOTE(review): presumably performs full URL
-- (percent-)decoding of its string argument — confirm against
-- udf.FullDecodeUDF.
create temporary function my_decode as 'udf.FullDecodeUDF';
|
|
|
|
-- Session-scoped UDF. NOTE(review): presumably classifies a
-- user-agent string into a device type — confirm against
-- udf.DeviceUDF.
create temporary function my_dev as 'udf.DeviceUDF';
|
|
|
|
|
|
|
|
-- CTAS: build zhangwj.touna_used from the raw touna table, decoding the
-- base64-encoded referrer and user-agent, and extracting the search
-- keyword (kw) from known search-engine referrer URLs.
-- The inner subquery decodes ref/ua ONCE per row; the original repeated
-- cast(unbase64(ref) as string) in every LIKE test and every THEN branch
-- (15+ evaluations per row), which is both hard to read and wasted work.
create table zhangwj.touna_used
row format delimited fields terminated by '\t'
location '/user/xuji/yang/test/touna'
as
select
    srcip,
    datelabel,
    loadstamp,
    my_decode(url) as url,
    my_decode(ref_str) as ref,
    -- Pull the query keyword out of the referrer, per search engine.
    -- Branch order matters: baidu wd/word/kw are tried in sequence.
    case
        when ref_str like '%.baidu.%/%wd=%'    then my_decode(parse_url(ref_str, 'QUERY', 'wd'))
        when ref_str like '%.baidu.%/%word=%'  then my_decode(parse_url(ref_str, 'QUERY', 'word'))
        when ref_str like '%.baidu.%/%kw=%'    then my_decode(parse_url(ref_str, 'QUERY', 'kw'))
        when ref_str like '%.haosou.%/%q=%'    then my_decode(parse_url(ref_str, 'QUERY', 'q'))
        when ref_str like '%.sogou.%/%query=%' then my_decode(parse_url(ref_str, 'QUERY', 'query'))
        when ref_str like '%.bing.%/%q=%'      then my_decode(parse_url(ref_str, 'QUERY', 'q'))
        when ref_str like '%.google.%/%q=%'    then my_decode(parse_url(ref_str, 'QUERY', 'q'))
        else null
    end as kw,
    my_dev(ua_str) as ua
from (
    -- Decode the base64 columns once; downstream expressions reuse the
    -- aliases instead of re-decoding.
    select
        srcip,
        datelabel,
        loadstamp,
        url,
        cast(unbase64(ref) as string) as ref_str,
        cast(unbase64(ua)  as string) as ua_str
    from touna
) decoded;
|
|
|
|
|
|
|
|
## 查询结果保存到本地
|
|
|
|
|
|
|
|
-- Export the top 20000 search keywords by total page views to a local
-- directory as tab-separated files. WARNING: 'overwrite local
-- directory' wipes any existing contents of /home/xuji/zhangwj/task.
insert overwrite local directory '/home/xuji/zhangwj/task' row format delimited fields terminated by '\t'
|
|
|
|
select search_keyword,sum(pv) as kpv from yangbl.pingan_data_kw_20150302 group by search_keyword order by kpv desc limit 20000;
|
|
|
|
|
|
|
|
# hive脚本的使用
|
|
|
|
|
|
|
|
参见具体的脚本
|
|
|
|
|
|
|
|
# shell脚本的使用
|
|
|
|
|
|
|
|
参见具体脚本
|
|
|
|
|
|
|
|
# mr程序的编写和运行
|
|
|
|
|
|
|
|
参见具体程序 |