前提

本文涉及的组件与环境:UCloud、Nginx、Consul、Upsync;操作系统为 CentOS 7.x,示例机器的 IP 为 192.168.56.200。

动态负载均衡的基本原理

先看一个基于 upstream 的 Nginx 负载均衡配置:
# Baseline static configuration: requests arriving on port 80 are proxied to
# the fixed list of backends declared in the upstream group.
http {
    
    # Backend pool referenced by proxy_pass below.
    upstream upstream_server{
        server 127.0.0.1:8081;
        server 127.0.0.1:8082;
    }

    server {
        listen       80;
        server_name localhost;

        location / {
            # Forward every request to the upstream_server group.
            proxy_pass http://upstream_server;
        }
    }
}
如果需要让 8081 端口的服务实例下线,需要修改 upstream 配置:
upstream upstream_server{
    # The "down" flag marks this server as unavailable, so the instance on
    # port 8081 no longer takes part in load balancing.
    server 127.0.0.1:8081 down;
    server 127.0.0.1:8082;
}
修改完成后需要执行 nginx -s reload 重新加载配置,upstream 的变更才会生效。
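也就是说,服务实例下线时需要在 upstream 中添加 down 标记、修改 Nginx 的 upstream 配置并执行 nginx -s reload;服务实例重新上线时需要去掉 upstream 中的 down 标记、修改 Nginx 的 upstream 配置并再次执行 nginx -s reload。
每次调整 upstream 都要修改 Nginx 配置并执行 nginx -s reload 重新加载 Nginx。
为了做到修改 upstream 之后 Nginx 能自动感知 upstream 的变更、无需手动 reload,可以把 upstream 配置存放在 Consul 中,由 Nginx 通过使用 C 语言编写的 Nginx 模块 nginx-upsync-module 动态同步。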

Consul安装和集群搭建

Consul 是 Hashicorp 公司基于 Golang 开发的开源组件,Consul 提供的功能包括服务发现以及 Service Segmentation/Service Mesh 等。

下面是安装过程:

# Download Consul 1.7.3 into /data/consul and unpack it.
# -p creates missing parents (/data) and does not fail when the directory
# already exists; the && chain stops at the first failing step so that wget and
# unzip never run in the wrong directory.
mkdir -p /data/consul \
  && cd /data/consul \
  && wget https://releases.hashicorp.com/consul/1.7.3/consul_1.7.3_linux_amd64.zip \
  && unzip consul_1.7.3_linux_amd64.zip
# Note: the archive contains a single executable named consul.
nohup /data/consul/consul agent -server -data-dir=/tmp/consul -bootstrap -ui -advertise=192.168.56.200 -client=192.168.56.200 > /dev/null 2>&1 &

启动 Consul 后,可以通过 http://192.168.56.200:8500/ 访问 Consul 的 UI 管理界面。

下面基于单台虚拟机搭建一个伪集群,「关于集群的一些配置属性的含义和命令参数的解释暂时不进行展开」

# Create the data directory and the log directory of each of the three nodes.
# -p creates the parent nodeN directories as needed and makes the command safe
# to re-run.
mkdir -p /data/consul/node1/logs /data/consul/node2/logs /data/consul/node3/logs
在 /data/consul/node1 目录下新建 consul_conf.json 文件,内容如下:
{
  "datacenter": "es8-dc",
  "data_dir": "/data/consul/node1",
  "log_file": "/data/consul/node1/consul.log",
  "log_level": "INFO",
  "server": true,
  "node_name": "node1",
  "ui": true,
  "bind_addr": "192.168.56.200",
  "client_addr": "192.168.56.200",
  "advertise_addr": "192.168.56.200",
  "bootstrap_expect": 3,
  "ports":{
    "http": 8510,
    "dns": 8610,
    "server": 8310,
    "serf_lan": 8311,
    "serf_wan": 8312
    }
}
在 /data/consul/node2 目录下新建 consul_conf.json 文件,内容如下:
{
  "datacenter": "es8-dc",
  "data_dir": "/data/consul/node2",
  "log_file": "/data/consul/node2/consul.log",
  "log_level": "INFO",
  "server": true,
  "node_name": "node2",
  "ui": true,
  "bind_addr": "192.168.56.200",
  "client_addr": "192.168.56.200",
  "advertise_addr": "192.168.56.200",
  "bootstrap_expect": 3,
  "ports":{
    "http": 8520,
    "dns": 8620,
    "server": 8320,
    "serf_lan": 8321,
    "serf_wan": 8322
    }
}
在 /data/consul/node3 目录下新建 consul_conf.json 文件,内容如下:
{
  "datacenter": "es8-dc",
  "data_dir": "/data/consul/node3",
  "log_file": "/data/consul/node3/consul.log",
  "log_level": "INFO",
  "server": true,
  "node_name": "node3",
  "ui": true,
  "bind_addr": "192.168.56.200",
  "client_addr": "192.168.56.200",
  "advertise_addr": "192.168.56.200",
  "bootstrap_expect": 3,
  "ports":{
    "http": 8530,
    "dns": 8630,
    "server": 8330,
    "serf_lan": 8331,
    "serf_wan": 8332
    }
}

新建一个集群启动脚本:

cd /data/consul
touch service.sh
# Contents of /data/consul/service.sh:
# Start node1 first, then bring up node2 and node3 ten seconds apart; both are
# started with -retry-join pointing at 192.168.56.200:8311.
consul_bin=/data/consul/consul
join_addr=192.168.56.200:8311
nohup "${consul_bin}" agent -config-file=/data/consul/node1/consul_conf.json > /dev/null 2>&1 &
for node in node2 node3; do
  sleep 10
  nohup "${consul_bin}" agent -config-file="/data/consul/${node}/consul_conf.json" -retry-join="${join_addr}" > /dev/null 2>&1 &
done

如果集群启动成功,观察节点1中的日志如下:

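日志中可以看到 HTTP 服务已经启动以及 Leader 选举的结果。
Consul 集群启动后,Consul 基于 Raft 协议选举出 Leader 节点,这里 Leader 节点的 HTTP 端点是 192.168.56.200:8510。
注意:Consul 集群重启或者 Leader 节点故障后 Leader 可能发生变更,此时需要重新确认新 Leader 节点的 HTTP 端点(IP 和端口)。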

Nginx编译安装

直接从官网下载源码包并且解压:

# Download and unpack the Nginx 1.18.0 source tarball under /data/nginx.
# -p tolerates an existing directory and creates /data when missing; the &&
# chain stops at the first failure. The download uses HTTPS so the tarball is
# not fetched over an unencrypted connection.
mkdir -p /data/nginx \
  && cd /data/nginx \
  && wget https://nginx.org/download/nginx-1.18.0.tar.gz \
  && tar -zxvf nginx-1.18.0.tar.gz
解压后得到 /data/nginx/nginx-1.18.0 目录,编译之前需要先安装依赖 pcre-devel 和 zlib-devel:
# Install the PCRE and zlib development packages in a single yum call.
yum -y install pcre-devel zlib-devel

编译命令如下:

cd /data/nginx/nginx-1.18.0
./configure --prefix=/data/nginx
如果 ./configure 执行过程没有出现问题,接着执行 make 进行编译:
cd /data/nginx/nginx-1.18.0
make
如果 make 执行过程没有出现问题,最后执行 make install,安装到 --prefix 指定的目录:
cd /data/nginx/nginx-1.18.0
make install
make install 执行完成后,/data/nginx 目录下会生成 Nginx 的 sbin、logs、conf 等目录,接着启动 Nginx:
/data/nginx/sbin/nginx
启动后访问 80 端口即可看到 Nginx 的欢迎页面。

通过nginx-upsync-module和nginx_upstream_check_module模块进行编译

为 Nginx 添加 nginx-upsync-module 和 nginx_upstream_check_module 两个模块,需要先下载模块源码,再重新编译 Nginx:
mkdir /data/nginx/modules
cd /data/nginx/modules
# 这里是Github的资源,不能用wget下载,具体是:
nginx-upsync-module需要下载release里面的最新版本:v2.1.2
nginx_upstream_check_module需要下载整个项目的源码,主要用到靠近当前版本的补丁,使用patch命令进行补丁升级
下载并解压到 /data/nginx/modules 目录后,目录内容如下:
ll /data/nginx/modules
drwxr-xr-x. 6 root root   4096 Nov  3  2019 nginx_upstream_check_module-master
drwxrwxr-x. 5 root root     93 Dec 18 00:56 nginx-upsync-module-2.1.2

编译前,还要先安装一些前置依赖组件:

# Build prerequisites. CentOS/yum package names are used here: pcre,
# pcre-devel and zlib-devel are the yum equivalents of the Debian/Ubuntu names
# libpcre3, libpcre3-dev and zlib1g-dev, which yum cannot resolve.
yum -y install pcre pcre-devel zlib-devel ruby patch
接着为 Nginx 源码打补丁并重新编译安装:
cd /data/nginx/nginx-1.18.0
# Apply the nginx_upstream_check_module patch to the Nginx source tree. patch
# reads the diff from stdin, so the patch file must be redirected in with "<";
# use the check_*.patch file from the module directory that is closest to the
# Nginx version being built (check_1.16.1+.patch is assumed here for 1.18.0 -
# verify that it exists in your checkout).
patch -p1 < /data/nginx/modules/nginx_upstream_check_module-master/check_1.16.1+.patch
# Configure with both third-party modules compiled in.
./configure --prefix=/data/nginx --add-module=/data/nginx/modules/nginx_upstream_check_module-master --add-module=/data/nginx/modules/nginx-upsync-module-2.1.2
make
make install
上面的 make 过程出现了异常,可能与 Nginx 版本过高有关(OpenResty、Nginx 相关的 Issue 中有类似的问题反馈)。
这里改用较低版本的 Nginx,并使用补丁 check_1.12.1+.patch:
# Before this step, delete everything under /data/nginx except the modules
# directory downloaded earlier.
# The && chain stops at the first failure so the tarball is never downloaded
# or unpacked in the wrong directory; the download uses HTTPS.
cd /data/nginx \
  && wget https://nginx.org/download/nginx-1.14.2.tar.gz \
  && tar -zxvf nginx-1.14.2.tar.gz

开始编译安装:

cd /data/nginx/nginx-1.14.2
# Apply the nginx_upstream_check_module patch to the Nginx source tree. patch
# reads the diff from stdin, so the patch file is redirected in with "<".
patch -p1 < /data/nginx/modules/nginx_upstream_check_module-master/check_1.12.1+.patch
# Configure with both third-party modules compiled in.
./configure --prefix=/data/nginx --add-module=/data/nginx/modules/nginx_upstream_check_module-master --add-module=/data/nginx/modules/nginx-upsync-module-2.1.2
make && make install
# Start Nginx from the install prefix.
/data/nginx/sbin/nginx

启用动态负载均衡和健康检查

首先编写一个简单的 HTTP 服务,这里没有使用 Java,而是选用 Golang 编写:
package main

import (
 "flag"
 "fmt"
 "net/http"
)

// main starts a minimal HTTP server intended to be run as several instances
// on different ports.
//
// Flags:
//
//	-h  address to bind to (default 127.0.0.1)
//	-p  port to listen on (default 9000)
//
// Both handlers append the listen address to the response body, so a client
// can tell which instance answered a request.
func main() {
    host := flag.String("h", "127.0.0.1", "IP地址")
    port := flag.Int("p", 9000, "端口")
    flag.Parse()
    address := fmt.Sprintf("%s:%d", *host, *port)
    // Fprintf formats straight into the response writer; wrapping Sprintf in
    // Fprintln formatted the string twice for the same output.
    http.HandleFunc("/ping", func(writer http.ResponseWriter, _ *http.Request) {
        _, _ = fmt.Fprintf(writer, "%s by %s\n", "pong", address)
    })
    http.HandleFunc("/", func(writer http.ResponseWriter, _ *http.Request) {
        _, _ = fmt.Fprintf(writer, "%s by %s\n", "hello world", address)
    })
    // ListenAndServe only returns on failure (e.g. the port is already in use).
    if err := http.ListenAndServe(address, nil); err != nil {
        panic(err)
    }
}

编译:

cd src
set GOARCH=amd64
set GOOS=linux
go build -o ../bin/app app.go
编译后 bin 目录下会生成 Linux 下的可执行文件 app,上传到服务器后分别以 9000 和 9001 端口启动两个服务实例:
# Remember to give the app file execute permission first: chmod 773 app
# Launch one detached instance per port.
for port in 9000 9001; do
  nohup ./app -p "${port}" >/dev/null 2>&1 &
done
修改 Nginx 的配置,添加 upstream:
# Fragment of /data/nginx/conf/nginx.conf
http {
    include       mime.types;
    default_type  application/octet-stream;
    sendfile        on;
    keepalive_timeout  65;

    upstream app {
       # HTTP endpoint of the Consul leader node
       upsync 192.168.56.200:8510/v1/kv/upstreams/app/ upsync_timeout=6m upsync_interval=500ms upsync_type=consul strong_dependency=off;
       # Fallback configuration used when Consul cannot be reached
       upsync_dump_path /data/nginx/app.conf;
       # Present so that Nginx's configuration syntax check passes
       include /data/nginx/app.conf;
       # The next three directives configure the health check
       check interval=1000 rise=2 fall=2 timeout=3000 type=http default_down=false;
       check_http_send "HEAD / HTTP/1.0\r\n\r\n";
       check_http_expect_alive http_2xx http_3xx;
    }

    server {
        listen       80;
        server_name  localhost;
        location / {
            proxy_pass http://app;
        }
        # Health check - show the load-balancing server list
        location /upstream_list {
            upstream_show;
        }
        # Health check - show the load-balancing status
        location /upstream_status {
            check_status;
            access_log off;
        }
    }
}

# /data/nginx/app.conf
server 127.0.0.1:9000 weight=1 fail_timeout=10 max_fails=3;
server 127.0.0.1:9001 weight=1 fail_timeout=10 max_fails=3;
然后通过 HTTP 接口向 Consul 写入两个服务实例的配置:
# PUT the same server attributes for both backend instances under the
# upstreams/app/ key prefix, one key per instance address.
for port in 9000 9001; do
  curl -X PUT -d '{"weight":1, "max_fails":2, "fail_timeout":10}' "http://192.168.56.200:8510/v1/kv/upstreams/app/127.0.0.1:${port}"
done
最后重新启动(或者重新加载)Nginx 即可。

动态负载均衡测试

先通过 Consul 把 9000 端口的服务实例下线:
curl -X PUT -d '{"weight":1, "max_fails":2, "fail_timeout":10, "down":1}' http://192.168.56.200:8510/v1/kv/upstreams/app/127.0.0.1:9000
此时 9000 端口的服务实例被标记为 down,访问 http://192.168.56.200 只会返回 hello world by 127.0.0.1:9001。接着把 9000 端口的服务实例重新上线:
curl -X PUT -d '{"weight":1, "max_fails":2, "fail_timeout":10, "down":0}' http://192.168.56.200:8510/v1/kv/upstreams/app/127.0.0.1:9000
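此时访问 http://192.168.56.200 会轮流返回 hello world by 127.0.0.1:9000 和 hello world by 127.0.0.1:9001。接着使用 kill -9 杀掉 9000 端口的服务进程,并通过 /upstream_status 查看健康检查的状态。
此时访问 http://192.168.56.200 只会返回 hello world by 127.0.0.1:9001:虽然 9000 端口的服务实例在 Consul 中没有被标记为 down,但是 nginx_upstream_check_module 的健康检查已经把它从负载列表中剔除。
也就是说,nginx_upstream_check_module 负责健康检查,nginx-upsync-module 负责动态同步 upstream,两者需要配合使用。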

服务平滑发布

服务平滑发布依赖于前面花大量时间分析的动态负载均衡功能。笔者所在的团队比较小,所以选用了阿里云的云效作为产研管理平台,通过里面的流水线功能实现了服务平滑发布,下面是其中一个服务的生产环境部署的流水线:

其实平滑发布和平台的关系不大,整体的步骤大概如下:

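发布过程通过 shell 脚本编排,每个步骤之间使用 sleep n 预留等待时间。
对于服务 X 的每个实例:先在 Consul 中把 X_IP:PORT 设置为 down=1,然后 stop X_IP:PORT,部署新版本后 start X_IP:PORT,待 X_IP:PORT 启动完成后,再在 Consul 中把 X_IP:PORT 设置为 down=0。
这样就不需要在 Nginx 配置中 hard code 服务实例的 IP 和端口。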

小结

CI/CD

参考资料:

  • nginx-upsync-module
  • Nginx docs
  • Consul docs

(本文完 c-7-d e-a-20200613 感谢广州某金融科技公司运维大佬昊哥提供的支持)

技术公众号《Throwable文摘》(id:throwable-doge),不定期推送笔者原创技术文章(绝不抄袭或者转载):