一、目的
在完成数据清洗、错误数据之后,需要根据修复规则对错误数据进行修复
二、Hive中原有代码
insert into table hurys_db.dwd_queue partition(day) selecta3.id,a3.device_no,a3.source_device_type,a3.sn,a3.model,a3.create_time,a3.lane_no,a3.lane_type,case when a3.queue_count between 0 and 100 then a3.queue_count else a2.avg_queue_count end as queue_count,case when a3.queue_len between 0 and 500 then a3.queue_len else a2.avg_queue_len end as queue_len,case when a3.queue_head between 0 and 500 then a3.queue_head else a2.avg_queue_head end as queue_head,case when a3.queue_tail between 0 and 500 then a3.queue_tail else a2.avg_queue_tail end as queue_tail,a3.day from hurys_db.dwd_queue_error as a3 right join (selecta1.device_no,a1.create_time,a1.lane_no,round(avg(queue_count),0) avg_queue_count,round(avg(queue_len),2) avg_queue_len,round(avg(queue_head),2) avg_queue_head,round(avg(queue_tail),2) avg_queue_tail from(select t1.device_no, t1.create_time start_time, t2.create_time, t1.lane_no, t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail from hurys_db.dwd_queue as t1 right join hurys_db.dwd_queue_error as t2 on t2.device_no=t1.device_no and t2.lane_no=t1.lane_noand concat(date_sub(t2.create_time,7),substr(t2.create_time,11,10)) = t1.create_time where t1.device_no is not null union all select t1.device_no, t1.create_time start_time, t2.create_time, t1.lane_no, t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail from hurys_db.dwd_queue as t1 right join hurys_db.dwd_queue_error as t2 on t2.device_no=t1.device_no and t2.lane_no=t1.lane_noand concat(date_sub(t2.create_time,14),substr(t2.create_time,11,10)) = t1.create_time where t1.device_no is not null union all select t1.device_no, t1.create_time start_time, t2.create_time, t1.lane_no, t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail from hurys_db.dwd_queue as t1 right join hurys_db.dwd_queue_error as t2 on t2.device_no=t1.device_no and t2.lane_no=t1.lane_noand concat(date_sub(t2.create_time,21),substr(t2.create_time,11,10)) = t1.create_time where t1.device_no is not null ) as a1 group by a1.device_no, a1.create_time, a1.lane_no) as a2 on a3.device_no=a2.device_no and a3.create_time=a2.create_time and a3.lane_no=a2.lane_no where a3.day='2024-09-04' ;
三、ClickHouse中现有代码
--43、静态排队字段数据修复--修复策略:使用前三周同期数据取平均进行修复 selecta3.id,a3.device_no,a3.source_device_type,a3.sn,a3.model,a3.create_time,a3.lane_no,a3.lane_type,case when a3.queue_count between 0 and 100 then a3.queue_count else a2.avg_queue_count end as queue_count, case when a3.queue_len between 0 and 500 then a3.queue_len else cast(a2.avg_queue_len as Decimal(10,2)) end as queue_len,case when a3.queue_head between 0 and 500 then a3.queue_head else cast(a2.avg_queue_head as Decimal(10,2)) end as queue_head,case when a3.queue_tail between 0 and 500 then a3.queue_tail else cast(a2.avg_queue_tail as Decimal(10,2)) end as queue_tail,cast(a3.day as String) day from hurys_jw.dwd_queue_error as a3 right join ( selectdevice_no,start_time,lane_no,round(avg(queue_count),0) avg_queue_count,round(avg(queue_len) ,2) avg_queue_len,round(avg(queue_head),2) avg_queue_head,round(avg(queue_tail),2) avg_queue_tail from( select t1.device_no, t2.create_time start_time,t2.create_time_7 create_time, t1.lane_no,t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail from hurys_jw.dwd_queue as t1 inner join(selectdevice_no,lane_no,create_time,(create_time - interval 7 day) create_time_7,(create_time - interval 14 day)create_time_14,(create_time - interval 21 day)create_time_21 from hurys_jw.dwd_queue_error) as t2 on t2.device_no=t1.device_no and t2.lane_no=t1.lane_no and t2.create_time_7=t1.create_time where t1.device_no is not null union all select t1.device_no, t2.create_time start_time,t2.create_time_14 create_time, t1.lane_no,t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail from hurys_jw.dwd_queue as t1 inner join(selectdevice_no,lane_no,create_time,(create_time - interval 7 day) create_time_7,(create_time - interval 14 day)create_time_14,(create_time - interval 21 day)create_time_21 from hurys_jw.dwd_queue_error) as t2 on t2.device_no=t1.device_no and t2.lane_no=t1.lane_no and t2.create_time_14=t1.create_time where t1.device_no is not null union all select t1.device_no, t2.create_time start_time,t2.create_time_21 create_time, t1.lane_no,t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail from hurys_jw.dwd_queue as t1 inner join(selectdevice_no,lane_no,create_time,(create_time - interval 7 day) create_time_7,(create_time - interval 14 day)create_time_14,(create_time - interval 21 day)create_time_21 from hurys_jw.dwd_queue_error) as t2 on t2.device_no=t1.device_no and t2.lane_no=t1.lane_no and t2.create_time_21=t1.create_time where t1.device_no is not null) where lane_no is not null group by device_no, start_time, lane_no) as a2 on a3.device_no=a2.device_no and a3.create_time=a2.start_time and a3.lane_no=a2.lane_no where a3.day >= ? ;
注意:Hive中原有SQL语句和ClickHouse现有SQL语句很大不同
四、Kettle任务
方框标记是修复数据,高频率执行
下面其他几个是修复记录任务
4.1 newtime
4.2 替换NULL值
4.3 clickhouse输入
select
a3.id,
a3.device_no,
a3.source_device_type,
a3.sn,
a3.model,
a3.create_time,
a3.lane_no,
a3.lane_type,
case when a3.queue_count between 0 and 100 then a3.queue_count else a2.avg_queue_count end as queue_count,
case when a3.queue_len between 0 and 500 then a3.queue_len else cast(a2.avg_queue_len as Decimal(10,2)) end as queue_len,
case when a3.queue_head between 0 and 500 then a3.queue_head else cast(a2.avg_queue_head as Decimal(10,2)) end as queue_head,
case when a3.queue_tail between 0 and 500 then a3.queue_tail else cast(a2.avg_queue_tail as Decimal(10,2)) end as queue_tail,
cast(a3.day as String) day
from hurys_jw.dwd_queue_error as a3
right join (
select
device_no,
start_time,
lane_no,
round(avg(queue_count),0) avg_queue_count,
round(avg(queue_len) ,2) avg_queue_len,
round(avg(queue_head),2) avg_queue_head,
round(avg(queue_tail),2) avg_queue_tail
from(
select
t1.device_no, t2.create_time start_time,t2.create_time_7 create_time, t1.lane_no,t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail
from hurys_jw.dwd_queue as t1
inner join(select
device_no,lane_no,create_time,
(create_time - interval 7 day) create_time_7,
(create_time - interval 14 day)create_time_14,
(create_time - interval 21 day)create_time_21
from hurys_jw.dwd_queue_error) as t2
on t2.device_no=t1.device_no and t2.lane_no=t1.lane_no and t2.create_time_7=t1.create_time
where t1.device_no is not null
union all
select
t1.device_no, t2.create_time start_time,t2.create_time_14 create_time, t1.lane_no,t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail
from hurys_jw.dwd_queue as t1
inner join(select
device_no,lane_no,create_time,
(create_time - interval 7 day) create_time_7,
(create_time - interval 14 day)create_time_14,
(create_time - interval 21 day)create_time_21
from hurys_jw.dwd_queue_error) as t2
on t2.device_no=t1.device_no and t2.lane_no=t1.lane_no and t2.create_time_14=t1.create_time
where t1.device_no is not null
union all
select
t1.device_no, t2.create_time start_time,t2.create_time_21 create_time, t1.lane_no,t1.queue_count, t1.queue_len, t1.queue_head, t1.queue_tail
from hurys_jw.dwd_queue as t1
inner join(select
device_no,lane_no,create_time,
(create_time - interval 7 day) create_time_7,
(create_time - interval 14 day)create_time_14,
(create_time - interval 21 day)create_time_21
from hurys_jw.dwd_queue_error) as t2
on t2.device_no=t1.device_no and t2.lane_no=t1.lane_no and t2.create_time_21=t1.create_time
where t1.device_no is not null)
where lane_no is not null
group by device_no, start_time, lane_no
) as a2
on a3.device_no=a2.device_no and a3.create_time=a2.start_time and a3.lane_no=a2.lane_no
where a3.day >= ?
;
4.4 字段选择
4.5 clickhouse输出
4.6 执行SQL脚本
由于是对每一天的错误进行修复,因此每次执行后需要先删除这个分区的错误数据。因为每次执行清洗后都会先执行错误数据任务!
4.7 执行任务
4.8 海豚调度
注意在DWD层静态排队数据清洗、DWD层静态排队错误数据之后