标签:3.1.1 关系 class 模拟 执行时间 start screen uart time
表名 | 行数 |
page_click_fact | 1亿 |
page_dim | 20万 |
referrer_dim | 100万 |
browser_dim | 2万 |
status_code | 70 |
date_dim | 366 |
create database test; use test; create table browser_dim( browser_sk bigint, browser_nm varchar(100), browser_version_no varchar(100), flash_version_no varchar(100), flash_enabled_flg int, java_version_no varchar(100), platform_desc string, java_enabled_flg int, java_script_enabled_flg int, cookies_enabled_flg int, user_language_cd varchar(100), screen_color_depth_no varchar(100), screen_size_txt string) row format delimited fields terminated by ‘,‘ stored as orc; create table date_dim( cal_dt date, day_in_cal_yr_no int, day_of_week_no int, start_of_month_dt date, start_of_quarter_dt date, start_of_week_dt date, start_of_year_dt date) row format delimited fields terminated by ‘,‘ stored as orc; create table page_dim( page_sk bigint, domain_nm varchar(200), reachability_cd string, page_desc string, protocol_nm varchar(20)) row format delimited fields terminated by ‘,‘ stored as orc; create table referrer_dim( referrer_sk bigint, referrer_txt string, referrer_domain_nm varchar(200)) row format delimited fields terminated by ‘,‘ stored as orc; create table status_code_dim( status_cd varchar(100), client_error_flg int, status_cd_desc string, server_error_flg int) row format delimited fields terminated by ‘,‘ stored as orc; create table page_click_fact( visitor_id varchar(100), detail_tm timestamp, page_click_dt date, page_sk bigint, client_session_dt date, previous_page_sk bigint, referrer_sk bigint, next_page_sk bigint, status_cd varchar(100), browser_sk bigint, bytes_received_cnt bigint, bytes_sent_cnt bigint, client_detail_tm timestamp, entry_point_flg int, exit_point_flg int, ip_address varchar(20), query_string_txt string, seconds_spent_on_page_cnt int, sequence_no int, requested_file_txt string) row format delimited fields terminated by ‘,‘ stored as orc;说明:hive表使用ORCfile存储格式。
2.2 M /apps/hive/warehouse/test.db/browser_dim 641 /apps/hive/warehouse/test.db/date_dim 4.1 G /apps/hive/warehouse/test.db/page_click_fact 16.1 M /apps/hive/warehouse/test.db/page_dim 22.0 M /apps/hive/warehouse/test.db/referrer_dim 1.1 K /apps/hive/warehouse/test.db/status_code_dim
analyze table date_dim compute statistics; analyze table browser_dim compute statistics; analyze table page_dim compute statistics; analyze table referrer_dim compute statistics; analyze table status_code_dim compute statistics; analyze table page_click_fact compute statistics;
create schema ext; set search_path=ext; create external table date_dim( cal_dt date, day_in_cal_yr_no int4, day_of_week_no int4, start_of_month_dt date, start_of_quarter_dt date, start_of_week_dt date, start_of_year_dt date ) location (‘pxf://hdp1:51200/test.date_dim?profile=hiveorc‘) format ‘custom‘ (formatter=‘pxfwritable_import‘); create external table browser_dim( browser_sk int8, browser_nm varchar(100), browser_version_no varchar(100), flash_version_no varchar(100), flash_enabled_flg int, java_version_no varchar(100), platform_desc text, java_enabled_flg int, java_script_enabled_flg int, cookies_enabled_flg int, user_language_cd varchar(100), screen_color_depth_no varchar(100), screen_size_txt text ) location (‘pxf://hdp1:51200/test.browser_dim?profile=hiveorc‘) format ‘custom‘ (formatter=‘pxfwritable_import‘); create external table page_dim( page_sk int8, domain_nm varchar(200), reachability_cd text, page_desc text, protocol_nm varchar(20) ) location (‘pxf://hdp1:51200/test.page_dim?profile=hiveorc‘) format ‘custom‘ (formatter=‘pxfwritable_import‘); create external table referrer_dim( referrer_sk int8, referrer_txt text, referrer_domain_nm varchar(200) ) location (‘pxf://hdp1:51200/test.referrer_dim?profile=hiveorc‘) format ‘custom‘ (formatter=‘pxfwritable_import‘); create external table status_code_dim( status_cd varchar(100), client_error_flg int4, status_cd_desc text, server_error_flg int4 ) location (‘pxf://hdp1:51200/test.status_code_dim?profile=hiveorc‘) format ‘custom‘ (formatter=‘pxfwritable_import‘); create external table page_click_fact( visitor_id varchar(100), detail_tm timestamp, page_click_dt date, page_sk int8, client_session_dt date, previous_page_sk int8, referrer_sk int8, next_page_sk int8, status_cd varchar(100), browser_sk int8, bytes_received_cnt int8, bytes_sent_cnt int8, client_detail_tm timestamp, entry_point_flg int4, exit_point_flg int4, ip_address varchar(20), query_string_txt text, seconds_spent_on_page_cnt int4, sequence_no int4, requested_file_txt text ) location (‘pxf://hdp1:51200/test.page_click_fact?profile=hiveorc‘) format ‘custom‘ (formatter=‘pxfwritable_import‘);说明:HAWQ外部表使用PXF协议,指向相应的hive表。
set search_path=public; create table date_dim( cal_dt date, day_in_cal_yr_no int4, day_of_week_no int4, start_of_month_dt date, start_of_quarter_dt date, start_of_week_dt date, start_of_year_dt date) with (compresstype=snappy,appendonly=true); create table browser_dim( browser_sk int8, browser_nm varchar(100), browser_version_no varchar(100), flash_version_no varchar(100), flash_enabled_flg int, java_version_no varchar(100), platform_desc text, java_enabled_flg int, java_script_enabled_flg int, cookies_enabled_flg int, user_language_cd varchar(100), screen_color_depth_no varchar(100), screen_size_txt text ) with (compresstype=snappy,appendonly=true); create table page_dim( page_sk int8, domain_nm varchar(200), reachability_cd text, page_desc text, protocol_nm varchar(20) ) with (compresstype=snappy,appendonly=true); create table referrer_dim( referrer_sk int8, referrer_txt text, referrer_domain_nm varchar(200) ) with (compresstype=snappy,appendonly=true); create table status_code_dim( status_cd varchar(100), client_error_flg int4, status_cd_desc text, server_error_flg int4 ) with (compresstype=snappy,appendonly=true); create table page_click_fact( visitor_id varchar(100), detail_tm timestamp, page_click_dt date, page_sk int8, client_session_dt date, previous_page_sk int8, referrer_sk int8, next_page_sk int8, status_cd varchar(100), browser_sk int8, bytes_received_cnt int8, bytes_sent_cnt int8, client_detail_tm timestamp, entry_point_flg int4, exit_point_flg int4, ip_address varchar(20), query_string_txt text, seconds_spent_on_page_cnt int4, sequence_no int4, requested_file_txt text ) with (compresstype=snappy,appendonly=true);说明:内部表结构定义与hive表等价,使用snappy压缩的行存储格式。
insert into date_dim select * from hcatalog.test.date_dim; insert into browser_dim select * from hcatalog.test.browser_dim; insert into page_dim select * from hcatalog.test.page_dim; insert into referrer_dim select * from hcatalog.test.referrer_dim; insert into status_code_dim select * from hcatalog.test.status_code_dim; insert into page_click_fact select * from hcatalog.test.page_click_fact;说明:通过HCatalog直接查询hive表,插入到HAWQ内部表中。snappy压缩后的各表对应的HDFS文件大小如下:
6.2 K /hawq_data/16385/177422/177677 3.3 M /hawq_data/16385/177422/177682 23.9 M /hawq_data/16385/177422/177687 39.3 M /hawq_data/16385/177422/177707 1.8 K /hawq_data/16385/177422/177726 7.9 G /hawq_data/16385/177422/177731
analyze date_dim; analyze browser_dim; analyze page_dim; analyze referrer_dim; analyze status_code_dim; analyze page_click_fact;
-- hive查询 select top_directory, count(*) as unique_visits from (select distinct visitor_id, substr(requested_file_txt,1,10) top_directory from page_click_fact, page_dim, browser_dim where domain_nm = ‘support.sas.com‘ and flash_enabled_flg=1 and weekofyear(detail_tm) = 19 and year(detail_tm) = 2017 ) directory_summary group by top_directory order by unique_visits; -- HAWQ查询,只是用extract函数代替了hive的weekofyear和year函数,与hive的查询语句等价。 select top_directory, count(*) as unique_visits from (select distinct visitor_id, substr(requested_file_txt,1,10) top_directory from page_click_fact, page_dim, browser_dim where domain_nm = ‘support.sas.com‘ and flash_enabled_flg=1 and extract(week from detail_tm) = 19 and extract(year from detail_tm) = 2017 ) directory_summary group by top_directory order by unique_visits;
-- hive查询 select domain_nm, requested_file_txt, count(*) as unique_visitors, month from (select distinct domain_nm, requested_file_txt, visitor_id, month(detail_tm) as month from page_click_fact, page_dim, referrer_dim where domain_nm = ‘support.sas.com‘ and referrer_domain_nm = ‘www.google.com‘ ) visits_pp_ph_summary group by domain_nm, requested_file_txt, month order by domain_nm, requested_file_txt, unique_visitors desc, month asc; -- HAWQ查询,只是用extract函数代替了hive的month函数,与hive的查询语句等价。 select domain_nm, requested_file_txt, count(*) as unique_visitors, month from (select distinct domain_nm, requested_file_txt, visitor_id, extract(month from detail_tm) as month from page_click_fact, page_dim, referrer_dim where domain_nm = ‘support.sas.com‘ and referrer_domain_nm = ‘www.google.com‘ ) visits_pp_ph_summary group by domain_nm, requested_file_txt, month order by domain_nm, requested_file_txt, unique_visitors desc, month asc;
-- hive查询 select query_string_txt, count(*) as count from page_click_fact, page_dim where query_string_txt <> ‘‘ and domain_nm=‘support.sas.com‘ and year(detail_tm) = ‘2017‘ group by query_string_txt order by count desc; -- HAWQ查询,只是用extract函数代替了hive的year函数,与hive的查询语句等价。 select query_string_txt, count(*) as count from page_click_fact, page_dim where query_string_txt <> ‘‘ and domain_nm=‘support.sas.com‘ and extract(year from detail_tm) = ‘2017‘ group by query_string_txt order by count desc;
-- hive查询 select domain_nm, requested_file_txt, count(*) as unique_visitors from (select distinct domain_nm, requested_file_txt, visitor_id from page_click_fact, page_dim, browser_dim where domain_nm=‘support.sas.com‘ and browser_nm like ‘%Safari%‘ and weekofyear(detail_tm) = 19 and year(detail_tm) = 2017 ) uv_summary group by domain_nm, requested_file_txt order by unique_visitors desc; -- HAWQ查询,只是用extract函数代替了hive的weekofyear和year函数,与hive的查询语句等价。 select domain_nm, requested_file_txt, count(*) as unique_visitors from (select distinct domain_nm, requested_file_txt, visitor_id from page_click_fact, page_dim, browser_dim where domain_nm=‘support.sas.com‘ and browser_nm like ‘%Safari%‘ and extract(week from detail_tm) = 19 and extract(year from detail_tm) = 2017 ) uv_summary group by domain_nm, requested_file_txt order by unique_visitors desc;
-- hive查询 select domain_nm, requested_file_txt, count(*) as unique_visits from (select distinct domain_nm, requested_file_txt, visitor_id from page_click_fact, page_dim where domain_nm=‘support.sas.com‘ and weekofyear(detail_tm) = 19 and year(detail_tm) = 2017 and seconds_spent_on_page_cnt > 10 ) visits_summary group by domain_nm, requested_file_txt order by unique_visits desc; -- HAWQ查询,只是用extract函数代替了hive的weekofyear和year函数,与hive的查询语句等价。 select domain_nm, requested_file_txt, count(*) as unique_visits from (select distinct domain_nm, requested_file_txt, visitor_id from page_click_fact, page_dim where domain_nm=‘support.sas.com‘ and extract(week from detail_tm) = 19 and extract(year from detail_tm) = 2017 and seconds_spent_on_page_cnt > 10 ) visits_summary group by domain_nm, requested_file_txt order by unique_visits desc;
查询 | Hive(秒) | HAWQ外部表(秒) | HAWQ内部表(秒) |
1 | 74.337 | 304.134 | 19.232 |
2 | 169.521 | 150.882 | 3.446 |
3 | 73.482 | 101.216 | 18.565 |
4 | 66.367 | 359.778 | 1.217 |
5 | 60.341 | 118.329 | 2.789 |
标签:3.1.1 关系 class 模拟 执行时间 start screen uart time
原文地址:http://blog.csdn.net/wzy0623/article/details/71479539