查询响应
hive从3.0版本开始移除了索引。由于客户端抽样机制,超过五十行的,oracle要在查询语句外层添加select count(*) from (query语句),与query语句对比,获取到准确查询时间。详见附录关于抽样的说明。
查询时,oracle客户端是sqldeveloper,hive客户端是dbvear。
单表
不限制字段数量,使用 * 查询所有字段
等值查询
单条查询 code或者pk
oracle
select * from t_od_bd_stordoc WHERE code='1020002';
SELECT * FROM t_od_org_orgs WHERE pk_org = '1001A1100000000LX48O';
SELECT * FROM t_od_bd_material WHERE code = '302550991000019';
SELECT * FROM t_od_ic_flow WHERE pk_flow = '1001A11000000003KBHH';
SELECT * FROM t_od_ic_flow_10 WHERE pk_flow = '1001A11000000003KBHH';
SELECT * FROM t_od_ic_flow_100 WHERE pk_flow = '1001A11000000003KBHH';
create unique index uninx_pk_flow on t_od_ic_flow (pk_flow);
create index inx_pk_flow_10 on t_od_ic_flow_10 (pk_flow);
create index inx_pk_flow_100 on t_od_ic_flow_100 (pk_flow);
drop index uninx_pk_flow;
drop index inx_pk_flow_10;
drop index inx_pk_flow_100;
hive
select * from ht_od_bd_stordoc WHERE code='1020002';
SELECT * FROM ht_od_org_orgs WHERE pk_org = '1001A1100000000LX48O';
SELECT * FROM ht_od_bd_material WHERE code = '302550991000019';
SELECT * FROM ht_od_ic_flow WHERE pk_flow = '1001A11000000003KBHH';
表名 | 返回量 | 无索引 | 用索引 | ||
hive | oracle | hive | oracle | ||
XXX_BD_STORDOC | 行1列45 | 3s | 0.012s | ||
XXX_ORG_ORGS | 行1列130 | 2s | 0.015s | ||
XXX_BD_MATERIAL | 行1列86 | 6s | 0.037s | ||
XXX_IC_FLOW | 行1列166 | 8s | 0.534s | 0.043s | |
XXX_IC_FLOW_10 | 行10列166 | 11.261s | 0.047s | ||
XXX_IC_FLOW_100 | 行100列166 | 53.61s | 0.044s |
结论:当前硬件配置下,表行数在百万以内,oracle即使没有索引,响应速度依然能够很快。百万、千万记录的表,走索引的查询和不走索引的查询,响应速度差别巨大。
hive行数上万后,响应速度过长。
模糊查询
少量查询 code模糊查询
oracle
select * from t_od_bd_stordoc WHERE code LIKE '10%'
SELECT * FROM t_od_org_orgs WHERE code like '100%'
SELECT * FROM t_od_bd_material WHERE code like '30255099%'
SELECT * FROM t_od_ic_flow WHERE vbillcode like 'CR202308%'
SELECT * FROM t_od_ic_flow_10 WHERE vbillcode like 'CR202308%'
SELECT * FROM t_od_ic_flow_100 WHERE vbillcode like 'CR202308%'
CREATE INDEX inx_code ON T_OD_BD_MATERIAL (CODE);
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
CREATE INDEX inx_vbillcode_10 ON T_OD_IC_FLOW_10 (vbillcode );
CREATE INDEX inx_vbillcode_100 ON T_OD_IC_FLOW_100 (vbillcode );
drop index inx_vbillcode;
drop index inx_vbillcode_10;
drop index inx_vbillcode_100;
drop index inx_code;
hive
select * from ht_od_bd_stordoc WHERE code LIKE '10%'
SELECT * FROM ht_od_org_orgs WHERE code like '100%'
SELECT * FROM ht_od_bd_material WHERE code like '30255099%'
SELECT * FROM ht_od_ic_flow WHERE vbillcode like 'CR202308%'
表名 | 返回量 | 无索引 | 用索引 | ||
hive | oracle | hive | oracle | ||
XXX_BD_STORDOC | 行3列45 | 4s | 0.005s | ||
XXX_ORG_ORGS | 行13列130 | 8s | 0.024s | ||
XXX_BD_MATERIAL | 行191列86 | 10s | 0.120s | 0.064s | |
XXX_IC_FLOW | 行1834列166 | 4s | 0.682s | 0.147s | |
XXX_IC_FLOW_10 | 行18340列166 | 10.754s | 0.029s | ||
XXX_IC_FLOW_100 | 行183400列166 | 102.334s | 0.226s |
结论:类似等值查询。
范围查询
少量查询 code范围查询
少量查询 code范围查询
oracle
select * from t_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT * FROM t_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT * FROM t_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT * FROM t_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT * FROM t_od_ic_flow_10 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT * FROM t_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
hive
select * from ht_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT * FROM ht_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT * FROM ht_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT * FROM ht_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
表名 | 返回量 | 无索引 | 用索引 | ||
hive | oracle | hive | oracle | ||
XXX_BD_STORDOC | 行6列45 | 4s | 0.009s | ||
XXX_ORG_ORGS | 行69列130 | 6s | 0.021s | ||
XXX_BD_MATERIAL | 行999列86 | 13.115s | 0.193s | 0.022s | |
XXX_IC_FLOW | 行1668列166 | 6s | 0.625s | 0.232s | |
XXX_IC_FLOW_10 | 行16680列166 | 10.065s | 0.148s/0.011s | ||
XXX_IC_FLOW_100 | 行166800列166 | 90.398s | 0.145s/0.045s |
单表2
根据分析类查询的特点,一般仪表板select 3-5个字段足矣(多维查询则需要更多字段),所以这里统计查询3-5个字段的情况。
等值查询
单条查询 code或者pk
oracle
select pk_stordoc,code,name from t_od_bd_stordoc WHERE code='1020002';
SELECT pk_org,code,name FROM t_od_org_orgs WHERE pk_org = '1001A1100000000LX48O';
SELECT pk_material,code,name FROM t_od_bd_material WHERE code = '302550991000019';
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow WHERE pk_flow = '1001A11000000003KBHH';
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_10 WHERE pk_flow = '1001A11000000003KBHH';
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_100 WHERE pk_flow = '1001A11000000003KBHH';
create unique index uninx_pk_flow on t_od_ic_flow (pk_flow);
create index inx_pk_flow_10 on t_od_ic_flow_10 (pk_flow);
create index inx_pk_flow_100 on t_od_ic_flow_100 (pk_flow);
drop index uninx_pk_flow;
drop index inx_pk_flow_10;
drop index inx_pk_flow_100;
hive
select pk_org,code,name from ht_od_bd_stordoc WHERE code='1020002';
SELECT * FROM ht_od_org_orgs WHERE pk_org = '1001A1100000000LX48O';
SELECT * FROM ht_od_bd_material WHERE code = '302550991000019';
SELECT * FROM ht_od_ic_flow WHERE pk_flow = '1001A11000000003KBHH';
表名 | 返回量 | 无索引 | 用索引 | ||
hive | oracle | hive | oracle | ||
XXX_BD_STORDOC | 行1列45 | 0.982s/0.264s | 0.117s | ||
XXX_ORG_ORGS | 行1列130 | 1s/0.272s | 0.032s | ||
XXX_BD_MATERIAL | 行1列86 | 1s/0.855s | 0.337s | 0.02s | |
XXX_IC_FLOW | 行1列166 | 7s/6s | 2.816s | 0.035s | |
XXX_IC_FLOW_10 | 行10列166 | 17.308s | 0.024s | ||
XXX_IC_FLOW_100 | 行100列166 | 108.944s | 0.042s |
结论:当前硬件配置下,表行数在百万以内,oracle即使没有索引,响应速度依然能够很快。百万、千万记录的表,走索引的查询和不走索引的查询,响应速度差别巨大。
hive行数上万后,响应速度过长。
模糊查询
少量查询 code模糊查询
oracle
select pk_stordoc,code,name from t_od_bd_stordoc WHERE code LIKE '10%'
SELECT pk_org,code,name FROM t_od_org_orgs WHERE code like '100%'
SELECT pk_material,code,name FROM t_od_bd_material WHERE code like '30255099%'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow WHERE vbillcode like 'CR202308%'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_10 WHERE vbillcode like 'CR202308%'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_100 WHERE vbillcode like 'CR202308%'
CREATE INDEX inx_code ON T_OD_BD_MATERIAL (CODE);
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
select count(*) cc from (SELECT * FROM t_od_ic_flow_10 WHERE vbillcode like 'CR202308%')
select count(*) cc from (SELECT * FROM t_od_ic_flow_100 WHERE vbillcode like 'CR202308%')
hive
select * from ht_od_bd_stordoc WHERE code LIKE '10%'
SELECT * FROM ht_od_org_orgs WHERE code like '100%'
SELECT * FROM ht_od_bd_material WHERE code like '30255099%'
SELECT * FROM ht_od_ic_flow WHERE vbillcode like 'CR202308%'
SELECT * FROM ht_od_ic_flow_10 WHERE vbillcode like 'CR202308%'
SELECT * FROM ht_od_ic_flow_100 WHERE vbillcode like 'CR202308%'
select count(*) cc from (SELECT * FROM ht_od_ic_flow_10 WHERE vbillcode like 'CR202308%')
select count(*) cc from (SELECT * FROM ht_od_ic_flow_100 WHERE vbillcode like 'CR202308%')
表名 | 返回量 | 无索引 | 用索引 | ||
hive | oracle | hive | oracle | ||
XXX_BD_STORDOC | 行3列45 | 0.016s | |||
XXX_ORG_ORGS | 行13列130 | 0.022s | |||
XXX_BD_MATERIAL | 行191列86 | 0.13s | 0.018s | ||
XXX_IC_FLOW | 行1834列166 | 0.901s | 0.031s | ||
XXX_IC_FLOW_10 | 行18340列166 | 11.723s | 0.049s | ||
XXX_IC_FLOW_100 | 行183400列166 | 117.365s | 0.29s |
结论:类似等值查询。
范围查询
少量查询 code范围查询
oracle
drop index inx_code
drop index inx_vbillcode
select pk_stordoc,code,name from t_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT pk_org,code,name FROM t_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT pk_material,code,name FROM t_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_10 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM t_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
CREATE INDEX inx_code ON T_OD_BD_MATERIAL (CODE);
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
hive
select pk_stordoc,code,name from ht_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT pk_org,code,name FROM ht_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT pk_material,code,name FROM ht_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT cmaterialoid,cwarehouseid,dbilldate,ninnum,ncostmny FROM ht_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
表名 | 返回量 | 无索引 | 用索引 | ||
hive | oracle | hive | oracle | ||
XXX_BD_STORDOC | 行6列45 | 1s/0.139s | 0.043s | ||
XXX_ORG_ORGS | 行69列130 | 1s/0.343s | 0.039s | ||
XXX_BD_MATERIAL | 行999列86 | 1/0.502s | 0.091s | 0.031s | |
XXX_IC_FLOW | 行1668列166 | 4s/3s | 0.907s | 0.031s | |
XXX_IC_FLOW_10 | 行16680列166 | 9.196s | 0.029s | ||
XXX_IC_FLOW_100 | 行166800列166 | 91.147s | 0.08s |
聚合查询
oracle
drop index inx_code
drop index inx_vbillcode
select count(*) from t_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT count(*) FROM t_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT count(*) FROM t_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT sum(ncostmny),sum(ninnum) FROM t_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT sum(ncostmny),sum(ninnum) FROM t_od_ic_flow_10 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
SELECT sum(ncostmny),sum(ninnum) FROM t_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
hive
select count(*) from ht_od_bd_stordoc WHERE code >= 'DK01' AND code <= 'DK11'
SELECT count(*) FROM ht_od_org_orgs WHERE code >= '10100' AND code <= '10200'
SELECT count(*) FROM ht_od_bd_material WHERE code >= '201515201000003' AND code <= '201515901000003'
SELECT sum(ncostmny),sum(ninnum) FROM ht_od_ic_flow WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'
表名 | 返回量 | 无索引 | 用索引 | ||
hive | oracle | hive | oracle | ||
XXX_BD_STORDOC | 行1列1 | 0.038s | |||
XXX_ORG_ORGS | 行1列1 | 0.024s | |||
XXX_BD_MATERIAL | 行1列1 | 0.118s | 0.032s | ||
XXX_IC_FLOW | 行1列1 | 0.401s | 0.059s | ||
XXX_IC_FLOW_10 | 行1列1 | 9.371s | 0.454s | ||
XXX_IC_FLOW_100 | 行1列1 | 91.001s | 58.89s |
结论:查询ht_od_ic_flow时,有时会因内存崩溃而报错,详见hive聚合查询异常。oracle数据量达到千万级后,IO读写成为了瓶颈,所以即使加索引,耗时依然很久,甚至会出现超过不加索引时的耗时。同一会话多次执行同一sql,会有缓存,耗时会极大降低,即使千万级会降低至1.62s,即使清缓存,同样的sql语句依然是1.xxxs至3.xxxs,应该还有其他缓存机制。将SELECT sum(ncostmny),sum(ninnum) FROM ht_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001' AND vbillcode <= 'CR2023083000000001'中的vbillcode起始改为CR202308xx00000001,速度依然很快,往前调个两三天0731、0730、0729,速度依然很快,维持在1.xxxs,或者至多不超过10s。
分析查询
oracle
SELECT pk_org,cmaterialoid,pk_flow,ninnum
,sum(ninnum) OVER (PARTITION BY cmaterialoid )
,CASE WHEN sum(ninnum) OVER (PARTITION BY cmaterialoid )!=0
THEN ninnum / sum(ninnum) OVER (PARTITION BY cmaterialoid )
END pp
FROM t_od_ic_flow WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
SELECT pk_org,cmaterialoid,pk_flow,ninnum
,sum(ninnum) OVER (PARTITION BY cmaterialoid )
,CASE WHEN sum(ninnum) OVER (PARTITION BY cmaterialoid )!=0
THEN ninnum / sum(ninnum) OVER (PARTITION BY cmaterialoid )
END pp
FROM t_od_ic_flow_10 WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
SELECT pk_org,cmaterialoid,pk_flow,ninnum
,sum(ninnum) OVER (PARTITION BY cmaterialoid )
,CASE WHEN sum(ninnum) OVER (PARTITION BY cmaterialoid )!=0
THEN ninnum / sum(ninnum) OVER (PARTITION BY cmaterialoid )
END pp
FROM t_od_ic_flow_100 WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
hive
SELECT PK_ORG,CMATERIALOID,PK_FLOW,NINNUM,SUM(NINNUM) OVER (PARTITION BY CMATERIALOID )
,CASE WHEN SUM(NINNUM) OVER (PARTITION BY CMATERIALOID )!=0
THEN NINNUM / SUM(NINNUM) OVER (PARTITION BY CMATERIALOID )
END pp
FROM ht_od_ic_flow WHERE VBILLCODE >= 'CR2023080100000001'
AND VBILLCODE <= 'CR2023083000000001'
表名 | 返回量 | 无索引 | 用索引 | ||
hive | oracle | hive | oracle | ||
XXX_IC_FLOW | 行1668列6 | 0.375s | 0.261s | ||
XXX_IC_FLOW_10 | 行16680列6 | 9.016s | 6.53s | ||
XXX_IC_FLOW_100 | 行166800列6 | 91.337s | 66.474s |
多表
多表连接查询
分析查询
oracle
drop index inx_vbillcode
SELECT t1.pk_org,t2.code orgcode,t2.name orgname
,t1.cmaterialoid,t3.code materialcode,t3.name materialname
,t1.pk_flow,t1.ninnum
,sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )
,CASE WHEN sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )!=0
THEN t1.ninnum / sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )
END pp
FROM t_od_ic_flow t1
inner JOIN t_od_org_orgs t2 ON t1.pk_org = t2.pk_org
inner JOIN t_od_bd_material t3 ON t1.cmaterialoid = t3.pk_material
WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
CREATE INDEX inx_vbillcode ON T_OD_IC_FLOW (vbillcode );
CREATE INDEX inx_vbillcode_10 ON T_OD_IC_FLOW_10 (vbillcode );
CREATE INDEX inx_vbillcode_100 ON T_OD_IC_FLOW_100 (vbillcode );
hive
SELECT t1.pk_org,t2.code orgcode,t2.name orgname
,t1.cmaterialoid,t3.code materialcode,t3.name materialname
,t1.pk_flow,t1.ninnum
,sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )
,CASE WHEN sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )!=0
THEN t1.ninnum / sum(t1.ninnum) OVER (PARTITION BY t1.cmaterialoid )
END pp
FROM ht_od_ic_flow t1
inner JOIN ht_od_org_orgs t2 ON t1.pk_org = t2.pk_org
inner JOIN ht_od_bd_material t3 ON t1.cmaterialoid = t3.pk_material
WHERE vbillcode >= 'CR2023080100000001'
AND vbillcode <= 'CR2023083000000001'
表名 | 返回量 | 无索引 | 用索引 | ||
hive | oracle | hive | oracle | ||
XXX_IC_FLOW | 行1668列10 | 0.555s | 0.266s | ||
XXX_IC_FLOW_10 | 行16680列10 | 9.358s | 0.338s | ||
XXX_IC_FLOW_100 | 行166800列10 | 92.048s | 92.019s |
orcle数据达到千万级时,有无索引,响应时间几乎没有差别,此时瓶颈应该在硬件配置上。