ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享> 文章详细

大数据项目之业务数据采集(二)

2021-02-25 10:04:55  阅读:138  来源: 互联网

标签:info -% data 业务 采集 base import 数据 order


业务数据采集

平台模型搭建

在这里插入图片描述

1、业务采集

业务数据:与企业核心业务相关的业务,存放在MySQL数据库中,
需要将MySQL中的数据采集到hdfs中。

方案选择

1、数据传输:sqoop

优势:

  • 1、sqoop适用于业务数据场景,用于在RDBMS和HDFS之间互相导入数据
  • 2、批处理场景!在非实时的项目中,第二天导入数据,所以不需要流式处理,sqoop通过启动mapreduce且只有map,可以快速的将数据导入到HDFS
  • 3、开源免费

2、数据导入方式

一、全量
每天需要存一份完整数据,数据量不大,有更新和修改
二、增量
每天存储一份增量的数据,适用于数据量大,只有插入的操作
三、增量和变化
每天新增和变化,存储创建时间或操作时间为今天的数据
四、特殊策略
特殊的维度表(比如客观世界维度,日期维度,地区维度)

在这里插入图片描述

2、数据导入

问题1:null值存储问题

Hive中的Null在底层是以“\N”来存储,而MySQL中的Null在底层就是Null。为了保证数据两端的一致性,在导出数据时采用--input-null-string和--input-null-non-string两个参数,导入数据时采用--null-string和--null-non-string。

导入脚本:

#!/bin/bash

# 1. Pick the business date: use the script's second argument when it is
#    non-empty, otherwise fall back to yesterday formatted as YYYY-MM-DD.
if [[ -n "$2" ]]; then
  datestr=$2
else
  datestr=$(date -d '-1 day' +%Y-%m-%d)
fi

#######################################
# Shared Sqoop invocation used by every import_* function in this script.
# Globals:
#   datestr    (read) date string appended to the HDFS target directory
#   SQOOP_BIN  (read, optional) sqoop executable to run; defaults to
#              /opt/module/sqoop/bin/sqoop
# Arguments:
#   $1 - free-form SQL query handed to --query
#   $2 - table name; becomes the directory name under /gmall
#   Callers also pass a third argument, which is not used here.
# Returns:
#   2 when the query, the table name or datestr is empty (sqoop is not run);
#   otherwise the exit status of sqoop.
#######################################
import_data() {
  local query=${1:-}
  local table=${2:-}
  local sqoop_bin=${SQOOP_BIN:-/opt/module/sqoop/bin/sqoop}

  # --delete-target-dir is passed below, so refuse to build the target path
  # from empty pieces instead of pointing the delete at a parent directory.
  if [[ -z "$query" || -z "$table" || -z "${datestr:-}" ]]; then
    printf 'import_data: query, table name and datestr must be non-empty\n' >&2
    return 2
  fi

  # NOTE(review): the database password is passed in clear text on the command
  # line; Sqoop also offers --password-file / -P - consider switching.
  # '\\N' is kept exactly as before for both null options.
  "$sqoop_bin" import \
    --connect jdbc:mysql://hadoop102:3306/gmall \
    --username root \
    --password root123 \
    --delete-target-dir \
    --num-mappers 1 \
    --query "$query" \
    --target-dir "hdfs://hadoop102:8020/gmall/${table}/${datestr}" \
    --compress \
    --compression-codec lzop \
    --null-string '\\N' \
    --null-non-string '\\N' \
    --fields-terminated-by ,
}
# Options referred to by the note below:
#   --compress
#   --compression-codec lzop
# When lzop compression is used, an index has to be generated. Format:
#   hadoop jar <jar location> <fully-qualified class name> <location of the compressed files>
# hadoop jar /opt/module/hadoop/share/hadoop/common/hadoop-lzo-0.4.20.jar com.hadoop.compression.lzo.DistributedLzoIndexer hdfs://hadoop102:8020/gmall/$2/$datestr


# ---- Full imports ----

# Full load of base_dic: unfiltered query plus the table name go to import_data.
import_base_dic() {
  # Single quotes keep $CONDITIONS literal for Sqoop.
  local sql='select * from base_dic where $CONDITIONS'
  import_data "${sql}" base_dic id
}

# Full load of base_trademark: unfiltered query handed to import_data.
import_base_trademark() {
  # Single quotes keep $CONDITIONS literal for Sqoop.
  local sql='select * from base_trademark where $CONDITIONS'
  import_data "${sql}" base_trademark id
}

# Full load of base_category3: unfiltered query handed to import_data.
import_base_category3() {
  # Single quotes keep $CONDITIONS literal for Sqoop.
  local sql='select * from base_category3 where $CONDITIONS'
  import_data "${sql}" base_category3 id
}

# Full load of base_category2: unfiltered query handed to import_data.
import_base_category2() {
  # Single quotes keep $CONDITIONS literal for Sqoop.
  local sql='select * from base_category2 where $CONDITIONS'
  import_data "${sql}" base_category2 id
}

# Full load of base_category1: unfiltered query handed to import_data.
import_base_category1() {
  # Single quotes keep $CONDITIONS literal for Sqoop.
  local sql='select * from base_category1 where $CONDITIONS'
  import_data "${sql}" base_category1 id
}

# Full load of activity_rule: unfiltered query handed to import_data.
import_activity_rule() {
  # Single quotes keep $CONDITIONS literal for Sqoop.
  local sql='select * from activity_rule where $CONDITIONS'
  import_data "${sql}" activity_rule id
}

# Load of activity_info limited to rows whose create_time date is on or
# before the global datestr.
import_activity_info() {
  local up_to_day="DATE_FORMAT(create_time,'%Y-%m-%d')<='${datestr}'"
  import_data "select * from activity_info where ${up_to_day} and \$CONDITIONS" activity_info id
}

# Load of activity_sku limited to rows whose create_time date is on or
# before the global datestr.
import_activity_sku() {
  local up_to_day="DATE_FORMAT(create_time,'%Y-%m-%d')<='${datestr}'"
  import_data "select * from activity_sku where ${up_to_day} and \$CONDITIONS" activity_sku id
}

# Full load of cart_info: unfiltered query handed to import_data.
import_cart_info() {
  # Single quotes keep $CONDITIONS literal for Sqoop.
  local sql='select * from cart_info where $CONDITIONS'
  import_data "${sql}" cart_info id
}

# Load of favor_info limited to rows whose create_time date is on or
# before the global datestr.
import_favor_info() {
  local up_to_day="DATE_FORMAT(create_time,'%Y-%m-%d')<='${datestr}'"
  import_data "select * from favor_info where ${up_to_day} and \$CONDITIONS" favor_info id
}

# Load of coupon_info limited to rows whose create_time date is on or
# before the global datestr.
import_coupon_info() {
  local up_to_day="DATE_FORMAT(create_time,'%Y-%m-%d')<='${datestr}'"
  import_data "select * from coupon_info where ${up_to_day} and \$CONDITIONS" coupon_info id
}

# Full load of spu_info: unfiltered query handed to import_data.
import_spu_info() {
  # Single quotes keep $CONDITIONS literal for Sqoop.
  local sql='select * from spu_info where $CONDITIONS'
  import_data "${sql}" spu_info id
}

# Load of sku_info limited to rows whose create_time date is on or
# before the global datestr.
import_sku_info() {
  local up_to_day="DATE_FORMAT(create_time,'%Y-%m-%d')<='${datestr}'"
  import_data "select * from sku_info where ${up_to_day} and \$CONDITIONS" sku_info id
}


# ---- Incremental (newly added rows) imports ----

# Daily increment of order_refund_info: rows whose create_time date equals
# the global datestr.
import_order_refund_info() {
  local on_day="DATE_FORMAT(create_time,'%Y-%m-%d')='${datestr}'"
  import_data "select * from order_refund_info where ${on_day} and \$CONDITIONS" order_refund_info id
}

# Backfill of order_refund_info: rows whose create_time date is on or before
# the global datestr.
import_order_refund_info_all() {
  local up_to_day="DATE_FORMAT(create_time,'%Y-%m-%d')<='${datestr}'"
  import_data "select * from order_refund_info where ${up_to_day} and \$CONDITIONS" order_refund_info id
}

# Daily increment of order_status_log: rows whose operate_time date equals
# the global datestr.
import_order_status_log() {
  local on_day="DATE_FORMAT(operate_time,'%Y-%m-%d')='${datestr}'"
  import_data "select * from order_status_log where ${on_day} and \$CONDITIONS" order_status_log id
}

# Backfill of order_status_log: rows whose operate_time date is on or before
# the global datestr.
import_order_status_log_all() {
  local up_to_day="DATE_FORMAT(operate_time,'%Y-%m-%d')<='${datestr}'"
  import_data "select * from order_status_log where ${up_to_day} and \$CONDITIONS" order_status_log id
}

# Daily increment of payment_info: rows whose payment_time date equals the
# global datestr.
import_payment_info() {
  local on_day="DATE_FORMAT(payment_time,'%Y-%m-%d')='${datestr}'"
  import_data "select * from payment_info where ${on_day} and \$CONDITIONS" payment_info id
}

# Backfill of payment_info: rows whose payment_time date is on or before the
# global datestr.
import_payment_info_all() {
  local up_to_day="DATE_FORMAT(payment_time,'%Y-%m-%d')<='${datestr}'"
  import_data "select * from payment_info where ${up_to_day} and \$CONDITIONS" payment_info id
}

# Daily increment of order_detail: rows whose create_time date equals the
# global datestr.
import_order_detail() {
  local on_day="DATE_FORMAT(create_time,'%Y-%m-%d')='${datestr}'"
  import_data "select * from order_detail where ${on_day} and \$CONDITIONS" order_detail id
}

# Backfill of order_detail: rows whose create_time date is on or before the
# global datestr.
import_order_detail_all() {
  local up_to_day="DATE_FORMAT(create_time,'%Y-%m-%d')<='${datestr}'"
  import_data "select * from order_detail where ${up_to_day} and \$CONDITIONS" order_detail id
}

# Daily increment of activity_order: rows whose create_time date equals the
# global datestr.
import_activity_order() {
  local on_day="DATE_FORMAT(create_time,'%Y-%m-%d')='${datestr}'"
  import_data "select * from activity_order where ${on_day} and \$CONDITIONS" activity_order id
}

# Backfill of activity_order: rows whose create_time date is on or before the
# global datestr.
import_activity_order_all() {
  local up_to_day="DATE_FORMAT(create_time,'%Y-%m-%d')<='${datestr}'"
  import_data "select * from activity_order where ${up_to_day} and \$CONDITIONS" activity_order id
}

# Daily increment of comment_info: rows whose create_time date equals the
# global datestr.
import_comment_info() {
  local on_day="DATE_FORMAT(create_time,'%Y-%m-%d')='${datestr}'"
  import_data "select * from comment_info where ${on_day} and \$CONDITIONS" comment_info id
}

# Backfill of comment_info: rows whose create_time date is on or before the
# global datestr.
import_comment_info_all() {
  local up_to_day="DATE_FORMAT(create_time,'%Y-%m-%d')<='${datestr}'"
  import_data "select * from comment_info where ${up_to_day} and \$CONDITIONS" comment_info id
}


# ---- New-and-changed imports ----

# Daily new/changed rows of user_info: rows whose create_time OR operate_time
# date equals the global datestr.
import_user_info() {
  local col touched=''
  for col in create_time operate_time; do
    touched+="${touched:+ or }DATE_FORMAT(${col},'%Y-%m-%d')='${datestr}'"
  done
  # The OR group is parenthesised: AND binds tighter than OR, so without the
  # parentheses $CONDITIONS would only constrain the last predicate.
  import_data "select * from user_info where (${touched}) and \$CONDITIONS" user_info id
}

# Backfill of user_info: rows whose create_time OR operate_time date is on or
# before the global datestr.
import_user_info_all() {
  local col touched=''
  for col in create_time operate_time; do
    touched+="${touched:+ or }DATE_FORMAT(${col},'%Y-%m-%d')<='${datestr}'"
  done
  # The OR group is parenthesised: AND binds tighter than OR, so without the
  # parentheses $CONDITIONS would only constrain the last predicate.
  import_data "select * from user_info where (${touched}) and \$CONDITIONS" user_info id
}

# Daily new/changed rows of coupon_use: rows where any of get_time,
# using_time, used_time or expire_time falls on the global datestr.
import_coupon_use() {
  local col touched=''
  for col in get_time using_time used_time expire_time; do
    touched+="${touched:+ or }DATE_FORMAT(${col},'%Y-%m-%d')='${datestr}'"
  done
  # The OR group is parenthesised: AND binds tighter than OR, so without the
  # parentheses $CONDITIONS would only constrain the last predicate.
  import_data "select * from coupon_use where (${touched}) and \$CONDITIONS" coupon_use id
}

# Backfill of coupon_use: rows where any of get_time, using_time, used_time
# or expire_time is on or before the global datestr.
import_coupon_use_all() {
  local col touched=''
  # Same four columns as import_coupon_use; the earlier version filtered on
  # operate_time here instead of using_time.
  # NOTE(review): confirm against the coupon_use schema that using_time is the
  # intended column.
  for col in get_time using_time used_time expire_time; do
    touched+="${touched:+ or }DATE_FORMAT(${col},'%Y-%m-%d')<='${datestr}'"
  done
  # The OR group is parenthesised: AND binds tighter than OR, so without the
  # parentheses $CONDITIONS would only constrain the last predicate.
  import_data "select * from coupon_use where (${touched}) and \$CONDITIONS" coupon_use id
}

# Daily new/changed rows of order_info: rows where create_time, operate_time
# or expire_time falls on the global datestr.
import_order_info() {
  local col touched=''
  for col in create_time operate_time expire_time; do
    touched+="${touched:+ or }DATE_FORMAT(${col},'%Y-%m-%d')='${datestr}'"
  done
  # The OR group is parenthesised: AND binds tighter than OR, so without the
  # parentheses $CONDITIONS would only constrain the last predicate.
  import_data "select * from order_info where (${touched}) and \$CONDITIONS" order_info id
}

# Backfill of order_info: rows where create_time, operate_time or expire_time
# is on or before the global datestr.
import_order_info_all() {
  local col touched=''
  for col in create_time operate_time expire_time; do
    touched+="${touched:+ or }DATE_FORMAT(${col},'%Y-%m-%d')<='${datestr}'"
  done
  # The OR group is parenthesised: AND binds tighter than OR, so without the
  # parentheses $CONDITIONS would only constrain the last predicate.
  import_data "select * from order_info where (${touched}) and \$CONDITIONS" order_info id
}

# ---- Special imports ----

# Full load of base_region: unfiltered query handed to import_data.
import_base_region() {
  # Single quotes keep $CONDITIONS literal for Sqoop.
  local sql='select * from base_region where $CONDITIONS'
  import_data "${sql}" base_region id
}

# Full load of base_province: unfiltered query handed to import_data.
import_base_province() {
  # Single quotes keep $CONDITIONS literal for Sqoop.
  local sql='select * from base_province where $CONDITIONS'
  import_data "${sql}" base_province id
}

# 2. Choose the import strategy from the first script argument.

# Runs the 13 full-load imports that the "first" and "second" modes share.
_import_full_tables() {
  import_base_dic
  import_base_trademark
  import_base_category3
  import_base_category2
  import_base_category1
  import_activity_rule
  import_activity_info
  import_activity_sku
  import_cart_info
  import_favor_info
  import_coupon_info
  import_sku_info
  import_spu_info
}

# Runs the special-strategy imports (province and region tables).
_import_special_tables() {
  import_base_province
  import_base_region
}

#######################################
# Dispatch one import run.
# Arguments:
#   $1 - "first"  : initial load - full tables, the *_all variants of the
#                   incremental and new-and-changed tables, special tables
#        "second" : every later load - full tables plus the daily
#                   incremental and new-and-changed imports
#        <table>  : import just that one table (names listed in the case below)
# Outputs:
#   An error message on stderr when $1 is not recognised.
# Returns:
#   1 for an unrecognised argument; otherwise the status of the last import run.
#######################################
main() {
  local target=${1:-}

  case "$target" in
    first)
      _import_full_tables

      import_order_refund_info_all
      import_order_status_log_all
      import_payment_info_all
      import_order_detail_all
      import_activity_order_all
      import_comment_info_all

      import_coupon_use_all
      import_user_info_all
      import_order_info_all

      _import_special_tables
      ;;

    second)
      _import_full_tables

      import_order_refund_info
      import_order_status_log
      import_payment_info
      import_order_detail
      import_activity_order
      import_comment_info

      import_coupon_use
      import_user_info
      import_order_info
      ;;

    # ---- import by table name ----
    # Full: base_dic (code dictionary), base_trademark (brand),
    #   base_category1/2/3 (product category levels 1-3), activity_rule
    #   (promotion rule), activity_info (activity), activity_sku (products in
    #   an activity), cart_info (cart), favor_info (favourites), coupon_info
    #   (coupon), sku_info (SKU), spu_info (SPU)
    # Incremental: order_refund_info (refund), order_status_log (order status),
    #   payment_info (payment flow), order_detail (order detail),
    #   activity_order (activity-order link), comment_info (product comment)
    # New and changed: coupon_use (coupon usage), user_info (user),
    #   order_info (order)
    # Special: base_province (province), base_region (region)
    base_dic | base_trademark | base_category1 | base_category2 | base_category3 \
      | activity_rule | activity_info | activity_sku | cart_info | favor_info \
      | coupon_info | sku_info | spu_info \
      | order_refund_info | order_status_log | payment_info | order_detail \
      | activity_order | comment_info \
      | coupon_use | user_info | order_info \
      | base_province | base_region)
      # Every listed name has a matching import_<name> function.
      "import_${target}"
      ;;

    *)
      # Report bad input on stderr and fail so schedulers notice the mistake.
      echo "表名不存在或参数输入有误" >&2
      return 1
      ;;
  esac
}

main "$@"

标签:info,-%,data,业务,采集,base,import,数据,order
来源: https://blog.csdn.net/qq_38705144/article/details/114059969

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有