airflow和superset数据分析平台的搭建

一、环境搭建

  • OS: ubuntu 14.04
  • python: 2.7+

1.1 superset

  • 可视化数据分析平台
  • 默认使用sqlite数据库

1.1.1 安装

  1. 安装相关依赖
    sudo apt-get install build-essential libssl-dev libffi-dev python-dev python-pip libsasl2-dev libldap2-dev
  2. 创建superset的虚拟环境并使用
    sudo pip install virtualenv
    virtualenv superset
    . ./superset/bin/activate
  3. 安装 pip 和setuptools最新版以及pymysql
    pip install --upgrade setuptools pip
    pip install pymysql
  4. 安装superset
    pip install superset

1.1.2 初始化

# 创建Admin用户
fabmanager create-admin --app superset
# 初始化数据库
superset db upgrade
# 加载示例
superset load_examples
# 创建默认的角色和初始化权限
superset init
# 测试: 启动superset 的web server,端口号为8888
superset runserver -p8888

1.1.3 添加数据源

  1. 使用创建的admin用户登录
  2. Source->Database->add a new record
    • 填写SQLAlchemy URI(dialect+driver://username:password@host:port/database):mysql+pymysql://root:<password>@<host>/data_warehouse?charset=utf8
    • 点击Test Connection测试是否成功
    • 选中Expose in SQL Lab以使用SQL Lab

1.1.4 使用supervisorctl 来管理superset服务

  1. 启动脚本
    # vi /home/vagrant/superset/run.sh
    #!/bin/bash
    . /home/vagrant/superset/bin/activate
    superset runserver -p 8888
  2. supervisor配置
    mkdir /home/vagrant/logs
    # vi /etc/supervisor/conf.d/superset.conf
    [program:superset_web]
    command=sh /home/vagrant/superset/run.sh
    directory=/home/vagrant/superset
    user=vagrant
    autostart=true
    stdout_logfile=/home/vagrant/logs/supervisor_superset.log
  3. 重启supervisor: sudo service supervisor restart

1.2 airflow

1.2.1 安装和配置

  • 安装
    1. 使用mysql作为元数据库
      # 创建相关数据库及账号  
      create database airflow default charset utf8 collate utf8_general_ci;  
      create user 'airflow'@'localhost' identified by 'airflow';  
      grant all on airflow.* to 'airflow'@'localhost';  
      flush privileges;
    2. 配置airflow
      # 配置 airflow 的 home 目录  
      mkdir -p /usr/local/airflow/{dags,logs,plugins}  
      echo "export AIRFLOW_HOME=/usr/local/airflow" >> /etc/profile  
      source /etc/profile
      # 安装依赖
      sudo apt-get install python-mysqldb
      # 安装apache-airflow
      sudo pip install apache-airflow
      # 配置元数据库  
      vi /usr/local/airflow/airflow.cfg
      # dialect+driver://username:password@host:port/database
      [core]
      sql_alchemy_conn = mysql://airflow:airflow@localhost:3306/airflow
      # 初始化元数据库连接(默认sqlite)
      airflow initdb
      # 启动web服务测试
      airflow webserver -p 8889
  • 配置
    1. 使用supervisorctl 来管理airflow服务
      # 创建日志目录(supervisor 不会自动创建 stdout_logfile 所在目录)
      sudo mkdir -p /root/logs
      # vi /etc/supervisor/conf.d/airflow.conf
      [program:airflow_web]
      environment = AIRFLOW_HOME="/usr/local/airflow"
      command=/usr/local/bin/airflow webserver -p 8889
      directory=/usr/local/airflow
      user=root
      autostart=true
      stdout_logfile=/root/logs/supervisor_airflow.log
      [program:airflow_scheduler]
      environment = AIRFLOW_HOME="/usr/local/airflow"
      command=/usr/local/bin/airflow scheduler
      directory=/usr/local/airflow
      user=root
      autostart=true
      stdout_logfile=/root/logs/supervisor_airflow.log
      #此时可以用 supervisorctl 来管理airflow服务了  
      supervisorctl start airflow_web  
      supervisorctl stop airflow_web  
      supervisorctl restart airflow_web

1.2.2 安全认证

  • 配置
    # 添加密码模块  
    sudo pip install "apache-airflow[password]"
    # 启用访问认证  
    sudo vim /usr/local/airflow/airflow.cfg  
    [webserver]
    authenticate = true  
    auth_backend = airflow.contrib.auth.backends.password_auth  
    # 在 python 中执行添加账户:  
    import airflow  
    from airflow import models, settings
    from airflow.contrib.auth.backends.password_auth import PasswordUser
    user = PasswordUser(models.User())
    user.username = 'admin'
    user.email = 'admin@example.com'
    user.password = 'admin'
    session = settings.Session()  
    session.add(user)  
    session.commit()  
    session.close()  
    exit()  
    # 重启 airflow_web 服务
    sudo supervisorctl restart airflow_web

二、参考

标签: none

已有 2 条评论

  1. 考虑互换一下友链吗?

添加新评论