`
genius_bai
  • 浏览: 80972 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

[Hadoop] Hive HQL

阅读更多

[Hadoop] Hive  HQL

 

http://wiki.apache.org/hadoop/Hive/GettingStarted

 

Hive  Shell 例子

设置Map和Reduce

-- Force the job down to a single map task and a single reduce task.
-- (Keyword case normalized: the original mixed `set` and `SET`.)
SET mapred.map.tasks=1;

SET mapred.reduce.tasks=1;

 

http://www.cxrs.org/SQL/10584.html

 

CREATE TABLE pokes (foo INT, bar STRING); 

/user/hive/warehouse/pokes

 

CREATE TABLE invites (foo INT, bar STRING) PARTITIONED BY (ds STRING);

 

SHOW TABLES;

 

SHOW TABLES '.*s';

 

-- Append a new INT column to pokes (existing rows read it as NULL).
ALTER TABLE pokes ADD COLUMNS (new_col INT);
-- Append a column with an attached comment.
ALTER TABLE invites ADD COLUMNS (new_col2 INT COMMENT 'a comment');
-- Rename a table ('3koobecaf' is 'facebook3' reversed -- example taken from the Hive wiki).
ALTER TABLE events RENAME TO 3koobecaf;

 

DROP TABLE pokes;

 

DML 操作

默认列之间用 '\001'(Ctrl-A,即 new String(new byte[]{1}))进行分隔。

一般表

LOAD DATA LOCAL INPATH './examples/files/kv1.txt' OVERWRITE INTO TABLE pokes;

 

分区表

-- Loading into a partitioned table requires an explicit PARTITION spec.
LOAD DATA LOCAL INPATH './examples/files/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');
LOAD DATA LOCAL INPATH './examples/files/kv3.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-08');

 

LOAD DATA INPATH '/user/myname/kv2.txt' OVERWRITE INTO TABLE invites PARTITION (ds='2008-08-15');

 

SQL操作

设置MR参数

SET mapred.reduce.tasks=1;

查看当前所有参数

SET -v ;

 

查询

SELECT a.foo FROM invites a WHERE a.ds='2008-08-15';

 

INSERT OVERWRITE DIRECTORY '/tmp/hdfs_out' SELECT a.* FROM invites a WHERE a.ds='<DATE>';

 

INSERT OVERWRITE LOCAL DIRECTORY '/tmp/local_out' SELECT a.* FROM pokes a;

 

-- Assorted INSERT OVERWRITE examples: into a table, into HDFS and local directories,
-- with filters and aggregates. (Keyword case normalized: line 4 used lowercase `select`.)
INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a;
INSERT OVERWRITE TABLE events SELECT a.* FROM profiles a WHERE a.key < 100;
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/reg_3' SELECT a.* FROM events a;
INSERT OVERWRITE DIRECTORY '/tmp/reg_4' SELECT a.invites, a.pokes FROM profiles a;
INSERT OVERWRITE DIRECTORY '/tmp/reg_5' SELECT COUNT(1) FROM invites a WHERE a.ds='<DATE>';
-- WARNING: this reuses '/tmp/reg_5' and therefore clobbers the COUNT result written above.
INSERT OVERWRITE DIRECTORY '/tmp/reg_5' SELECT a.foo, a.bar FROM invites a;
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/sum' SELECT SUM(a.pc) FROM pc1 a;

 

SUM of a column; AVG, MIN and MAX can also be used.
 

 

双Partition Table,可以按天,按小时分配数据

CREATE TABLE day_hour_ptable  (id INT, content STRING) PARTITIONED BY (ds STRING, hour string);

 

 

-- Every partition column must be named in the PARTITION spec when loading.
LOAD DATA LOCAL INPATH './examples/files/kv2.txt' OVERWRITE INTO TABLE day_hour_ptable PARTITION (ds='2008-04-01',hour='01');
LOAD DATA LOCAL INPATH './examples/files/kv3.txt' OVERWRITE INTO TABLE day_hour_ptable PARTITION (ds='2008-04-01',hour='02');

 

 

select * from day_hour_ptable a where a.ds='2008-04-01' and a.hour='01' and a.id=238;

 

生成1天,24小时的数据,启用下列语句的时候,Hadoop生成24个Map。

 

-- Pin the job to one map and one reduce task for the timing comparison below.
-- (Keyword case normalized: original mixed `set`/`SET` and lowercase `select count`.)
SET mapred.map.tasks=1;

SET mapred.reduce.tasks=1;

-- Count rows in a single hour partition; partition pruning reads only that hour's files.
SELECT COUNT(a.id) FROM day_hour_ptable a WHERE a.ds='2008-04-01' AND a.hour='01';

执行时间:23秒

 

-- One reduce task per hour partition for the full-table count.
SET mapred.reduce.tasks=24;

-- Counts across all partitions (no partition filter), hence the 24 maps mentioned above.
SELECT COUNT(a.id) FROM day_hour_ptable a;

执行时间:37秒 

创建用'\t'作为表的原始文件分隔符,取代默认的'\001'(Ctrl-A)。

-- MovieLens-style ratings table; the source files are tab-separated text,
-- so the default field delimiter is overridden with '\t'.
CREATE TABLE u_data (
    userid   INT,
    movieid  INT,
    rating   INT,
    unixtime STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

 

创建分析Apache Web log的数据表

 

-- Table over raw Apache access-log lines, parsed by RegexSerDe: each capture
-- group in input.regex maps positionally to a column (host, identity, user, ...).
-- output.format.string controls how rows are serialized back out on write.
-- NOTE(review): `user` and `time` are reserved words in later Hive versions and
-- may need backtick quoting there -- verify against the target Hive release.
CREATE TABLE apachelog (
  host STRING,
  identity STRING,
  user STRING,
  time STRING,
  request STRING,
  status STRING,
  size STRING,
  referer STRING,
  agent STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
  "input.regex" = "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") (-|[0-9]*) (-|[0-9]*)(?: ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\"))?",
  "output.format.string" = "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s"
)
STORED AS TEXTFILE;

 

Hive QL例子

hive-0.3.99.1+0\examples

D:\7g\Personal\Resources\Architecture\Hadoop\hive-0.3.99.1+0\src\ql\src\test\queries\clientnegative

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics