
Deploying Ceph on Kubernetes

1. Helm deployment

1.1 Overview

To deploy Ceph into a K8S cluster you can use the ceph-helm project. The project currently has a few limitations:

  1. The public and cluster networks must be the same
  2. If the storage user is not admin, you must create that user in the Ceph cluster manually and create the corresponding Secrets in K8S
  3. ceph-mgr can only run with a single replica

1.2 Repository

Run the following commands to add ceph-helm to the local Helm repository:

# This project stores the chart in the Helm local repository; start helm serve if it is not already running
nohup /usr/local/bin/helm serve --address 0.0.0.0:8879 > /dev/null 2>&1 &

git clone https://github.com/ceph/ceph-helm
pushd ceph-helm/ceph
make
popd
# After a successful build the chart archive is located at ./ceph-0.1.0.tgz
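
To confirm the chart actually landed in the local repository, you can search for it. This is only a quick sanity check and assumes the Helm v2 local repository is registered under the name local:

helm repo list            # the "local" repository should point at the helm serve address above
helm search local/ceph    # should list the ceph chart that was just built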

1.3 Override values

The available values are described below:

ceph.yaml

# Which components to deploy
deployment:
  ceph: true
  storage_secrets: true
  client_secrets: true
  rbd_provisioner: true
  rgw_keystone_user_and_endpoints: false

# Change these values to use alternative images
images:
  ks_user: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
  ks_service: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
  ks_endpoints: docker.io/kolla/ubuntu-source-heat-engine:3.0.3
  bootstrap: docker.io/ceph/daemon:tag-build-master-luminous-ubuntu-16.04
  dep_check: docker.io/kolla/ubuntu-source-kubernetes-entrypoint:4.0.0
  daemon: docker.io/ceph/daemon:tag-build-master-luminous-ubuntu-16.04
  ceph_config_helper: docker.io/port/ceph-config-helper:v1.7.5
  # If you use the officially provided StorageClass you need to extend the kube-controller image,
  # otherwise you get "executable file not found in $PATH"
  rbd_provisioner: quay.io/external_storage/rbd-provisioner:v0.1.1
  minimal: docker.io/alpine:latest
  pull_policy: "IfNotPresent"

# Node selectors used by the individual Ceph components
labels:
  jobs:
    node_selector_key: ceph-mon
    node_selector_value: enabled
  mon:
    node_selector_key: ceph-mon
    node_selector_value: enabled
  mds:
    node_selector_key: ceph-mds
    node_selector_value: enabled
  osd:
    node_selector_key: ceph-osd
    node_selector_value: enabled
  rgw:
    node_selector_key: ceph-rgw
    node_selector_value: enabled
  mgr:
    node_selector_key: ceph-mgr
    node_selector_value: enabled

pod:
  dns_policy: "ClusterFirstWithHostNet"
  replicas:
    rgw: 1
    mon_check: 1
    rbd_provisioner: 2
    mgr: 1
  affinity:
    anti:
      type:
        default: preferredDuringSchedulingIgnoredDuringExecution
      topologyKey:
        default: kubernetes.io/hostname
  # If cluster resources are scarce, the resource requests below can be tuned
  resources:
    enabled: false
    osd:
      requests:
        memory: "256Mi"
        cpu: "100m"
      limits:
        memory: "1024Mi"
        cpu: "1000m"
    mds:
      requests:
        memory: "10Mi"
        cpu: "100m"
      limits:
        memory: "50Mi"
        cpu: "500m"
    mon:
      requests:
        memory: "50Mi"
        cpu: "100m"
      limits:
        memory: "100Mi"
        cpu: "500m"
    mon_check:
      requests:
        memory: "5Mi"
        cpu: "100m"
      limits:
        memory: "50Mi"
        cpu: "500m"
    rgw:
      requests:
        memory: "5Mi"
        cpu: "100m"
      limits:
        memory: "50Mi"
        cpu: "500m"
    rbd_provisioner:
      requests:
        memory: "5Mi"
        cpu: "100m"
      limits:
        memory: "50Mi"
        cpu: "500m"
    mgr:
      requests:
        memory: "5Mi"
        cpu: "100m"
      limits:
        memory: "50Mi"
        cpu: "500m"
    jobs:
      bootstrap:
        limits:
          memory: "1024Mi"
          cpu: "2000m"
        requests:
          memory: "128Mi"
          cpu: "100m"
      secret_provisioning:
        limits:
          memory: "1024Mi"
          cpu: "2000m"
        requests:
          memory: "128Mi"
          cpu: "100m"
      ks_endpoints:
        requests:
          memory: "128Mi"
          cpu: "100m"
        limits:
          memory: "1024Mi"
          cpu: "2000m"
      ks_service:
        requests:
          memory: "128Mi"
          cpu: "100m"
        limits:
          memory: "1024Mi"
          cpu: "2000m"
      ks_user:
        requests:
          memory: "128Mi"
          cpu: "100m"
        limits:
          memory: "1024Mi"
          cpu: "2000m"

secrets:
  keyrings:
    mon: ceph-mon-keyring
    mds: ceph-bootstrap-mds-keyring
    osd: ceph-bootstrap-osd-keyring
    rgw: ceph-bootstrap-rgw-keyring
    mgr: ceph-bootstrap-mgr-keyring
    admin: ceph-client-admin-keyring
  identity:
    admin: ceph-keystone-admin
    user: ceph-keystone-user
    user_rgw: ceph-keystone-user-rgw

# !! Adjust the network configuration to your environment
network:
  public:  10.0.0.0/16
  cluster: 10.0.0.0/16
  port:
    mon: 6789
    rgw: 8088

# !! Add any Ceph configuration options you need here
conf:
  # Settings for the object storage gateway (RGW) service
  rgw_ks:
    config:
      rgw_keystone_api_version: 3
      rgw_keystone_accepted_roles: "admin, _member_"
      rgw_keystone_implicit_tenants: true
      rgw_s3_auth_use_keystone: true
  ceph:
    override:
    append:
    config:
      global:
        mon_host: null
      osd:
        ms_bind_port_max: 7100

ceph:
  rgw_keystone_auth: false
  enabled:
    mds: true
    rgw: true
    mgr: true
  storage:
    # Directory-based OSDs: storage path on the host;
    # /var/lib/ceph-helm/osd is mounted into the container at /var/lib/ceph/osd
    osd_directory: /var/lib/ceph-helm
    mon_directory: /var/lib/ceph-helm
    # Collect logs under /var/log so fluentd can pick them up
    mon_log: /var/log/ceph/mon
    osd_log: /var/log/ceph/osd

# !! Whether to enable directory-based OSDs; requires the node label ceph-osd=enabled
# The storage location is determined by storage.osd_directory above and reuses the existing filesystem
osd_directory:
  enabled: false

# If set to 1, Ceph is allowed to format (zap) disks, which causes data loss
enable_zap_and_potentially_lose_data: true

# !! Block-device-based OSDs; requires node labels of the form ceph-osd-device-dev-*=enabled
osd_devices:
  - name: dev-vdb
    # Block device to use
    device: /dev/vdb
    # The journal can be placed on a separate block device to improve performance;
    # if not specified it lives on the device itself
    journal: /dev/vdc
    # Whether to wipe the partition table
    zap: "1"

bootstrap:
  enabled: false
  script: |
    ceph -s
    function ensure_pool () {
      ceph osd pool stats $1 || ceph osd pool create $1 $2
    }
    ensure_pool volumes 8

# Enabled mgr modules
ceph_mgr_enabled_modules:
  - restful
  - status

# Configuration of the mgr modules
ceph_mgr_modules_config:
  dashboard:
    port: 7000
  localpool:
    failure_domain: host
    subtree: rack
    pg_num: "128"
    num_rep: "3"
    min_size: "2"

# Commands to run after deployment/upgrade
# They are executed via kubectl
ceph_commands:
  - ceph osd pool create  pg_num
  - ceph osd crush tunables

# Kubernetes StorageClass configuration
storageclass:
  provision_storage_class: true
  provisioner: ceph.com/rbd
  # StorageClass name
  name: ceph-rbd
  monitors: null
  # Name of the RBD pool to use
  pool: rbd
  admin_id: admin
  admin_secret_name: pvc-ceph-conf-combined-storageclass
  admin_secret_namespace: ceph
  user_id: admin
  user_secret_name: pvc-ceph-client-key
  # Image format and features of the RBD devices
  image_format: "2"
  image_features: layering

endpoints:
  # Cluster domain suffix
  cluster_domain_suffix: k8s.gmem.cc
  identity:
    name: keystone
    namespace: null
    auth:
      admin:
        region_name: RegionOne
        username: admin
        password: password
        project_name: admin
        user_domain_name: default
        project_domain_name: default
      user:
        role: admin
        region_name: RegionOne
        username: swift
        password: password
        project_name: service
        user_domain_name: default
        project_domain_name: default
    hosts:
      default: keystone-api
      public: keystone
    host_fqdn_override:
      default: null
    path:
      default: /v3
    scheme:
      default: http
    port:
      admin:
        default: 35357
      api:
        default: 80
  object_store:
    name: swift
    namespace: null
    hosts:
      default: ceph-rgw
    host_fqdn_override:
      default: null
    path:
      default: /swift/v1
    scheme:
      default: http
    port:
      api:
        default: 8088
  ceph_mon:
    namespace: null
    hosts:
      default: ceph-mon
    host_fqdn_override:
      default: null
    port:
      mon:
        default: 6789

Example override values for directory-based OSDs on an Ext4 filesystem:

network:
  public: 10.0.0.0/8
  cluster: 10.0.0.0/8
conf:
  ceph:
    config:
      global:
        # Ext4 filesystem
        filestore_xattr_use_omap: true
      osd:
        ms_bind_port_max: 7100
        # Ext4 filesystem
        osd_max_object_name_len: 256
        osd_max_object_namespace_len: 64
        osd_crush_update_on_start: false
ceph:
  storage:
    osd_directory: /var/lib/ceph-helm
    mon_directory: /var/lib/ceph-helm
    mon_log: /var/log/ceph/mon
    osd_log: /var/log/ceph/osd
# Directory-based OSDs sharing a partition with the operating system
osd_directory:
  enabled: true
storageclass:
  name: ceph-rbd
  pool: rbd

1.4 Creating K8S resources

Create a namespace for Ceph:

kubectl create namespace ceph

Create the RBAC resources:

kubectl create -f ceph-helm/ceph/rbac.yaml

To deploy the Ceph cluster, the K8S nodes must be labeled according to the Ceph roles they will play:

  1. ceph-mon=enabled, added to nodes that will run mon
  2. ceph-mgr=enabled, added to nodes that will run mgr
  3. ceph-osd=enabled, added to nodes that will run OSDs, whether device-based or directory-based
  4. ceph-osd-device-NAME=enabled, added to nodes that will run device-based OSDs, where NAME is the OSD device name from ceph-overrides.yaml above, i.e.:
    1. ceph-osd-device-dev-vdb=enabled
    2. ceph-osd-device-dev-vdc=enabled

The corresponding K8S commands:

# Node that runs the Ceph monitor
kubectl label node xenial-100 ceph-mon=enabled ceph-mgr=enabled
# For each OSD node
kubectl label node xenial-100 ceph-osd=enabled ceph-osd-device-dev-vdb=enabled ceph-osd-device-dev-vdc=enabled
kubectl label node xenial-101 ceph-osd=enabled ceph-osd-device-dev-vdb=enabled ceph-osd-device-dev-vdc=enabled

1.5 Release

helm install --name=ceph local/ceph --namespace=ceph -f ceph-overrides.yaml
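
You can then verify that the release was created. This is just a quick check; the release name matches the install command above:

helm ls ceph
helm status ceph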

1.6 Checking status

Make sure all pods are running properly:

 # kubectl -n ceph get pods
NAME                                    READY     STATUS    RESTARTS   AGE
ceph-mds-7cb7c647c7-7w6pc               0/1       Pending   0          18h
ceph-mgr-66cb85cbc6-hsm65               1/1       Running   3          1h
ceph-mon-check-758b88d88b-2r975         1/1       Running   1          1h
ceph-mon-gvtq6                          3/3       Running   3          1h
ceph-osd-dev-vdb-clj5f                  1/1       Running   15         1h
ceph-osd-dev-vdb-hldw5                  1/1       Running   15         1h
ceph-osd-dev-vdb-l4v6t                  1/1       Running   15         1h
ceph-osd-dev-vdb-v5jmd                  1/1       Running   15         1h
ceph-osd-dev-vdb-wm4v4                  1/1       Running   15         1h
ceph-osd-dev-vdb-zwr65                  1/1       Running   15         1h
ceph-osd-dev-vdc-27wfk                  1/1       Running   15         1h
ceph-osd-dev-vdc-4w4fn                  1/1       Running   15         1h
ceph-osd-dev-vdc-cpkxh                  1/1       Running   15         1h
ceph-osd-dev-vdc-twmwq                  1/1       Running   15         1h
ceph-osd-dev-vdc-x8tpb                  1/1       Running   15         1h
ceph-osd-dev-vdc-zfrll                  1/1       Running   15         1h
ceph-rbd-provisioner-5544dcbcf5-n846s   1/1       Running   4          18h
ceph-rbd-provisioner-5544dcbcf5-t84bz   1/1       Running   3          18h
ceph-rgw-7f97b5b85d-nc5fq               0/1       Pending   0          18h

The MDS and RGW pods are Pending because no node has been given the corresponding labels:

# rgw, the RADOS Gateway, is Ceph's object storage gateway: a FastCGI service built on top of librados
# that exposes a REST API for storing and managing object data. Object storage suits files such as images and videos.
# rgw is compatible with common object storage APIs, including most of the Amazon S3 API and the OpenStack Swift API
ceph-rgw=enabled
# mds, the Metadata Server, is required to support the CephFS filesystem
ceph-mds=enabled
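
For example, to let them schedule you would label the chosen node. This is only a sketch, reusing the node name from the earlier examples:

kubectl label node xenial-100 ceph-rgw=enabled ceph-mds=enabled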

Now check the status of the Ceph cluster from the monitor pod:

# kubectl -n ceph exec -ti ceph-mon-gvtq6 -c ceph-mon -- ceph -s
  cluster:
    # Cluster identifier
    id:     08adecc5-72b1-4c57-b5b7-a543cd8295e7
    health: HEALTH_OK

  services:
    # Monitors
    mon: 1 daemons, quorum xenial-100
    # Managers
    mgr: xenial-100(active)
    # OSDs (Ceph data storage daemons)
    osd: 12 osds: 12 up, 12 in

  data:
    # Number of pools and PGs
    pools:   0 pools, 0 pgs
    # Number of objects
    objects: 0 objects, 0 bytes
    # Disk usage; for filesystem-backed OSDs the operating system's usage is counted as well
    usage:   1292 MB used, 322 GB / 323 GB avail
    # No PG is active yet, so the cluster cannot be used
    pgs:     100.000% pgs not active
             # undersized because there are not enough OSDs (replica count 3, only a single OSD at that point);
             # peered means the 128 PGs have been mapped to OSDs
             128 undersized+peered

# After setting the replica count to 1 the output changes to
#   pgs:     100.000% pgs not active
#            128 creating+peering
# and a short while later to
#   pgs:     128 active+clean
# Only at that point can PVCs be provisioned; before that a PVC stays in Provisioning and
# the provisioner log contains lines like:
#   attempting to acquire leader lease...
#   successfully acquired lease to provision for pvc ceph/ceph-pvc
#   stopped trying to renew lease to provision for pvc ceph/ceph-pvc, timeout reached

If the K8S cluster does not have a default StorageClass, you can set one:

kubectl patch storageclass ceph-rbd -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'

PVCs that do not explicitly declare a StorageClass will then be provisioned automatically through ceph-rbd, as in the sketch below.
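
A minimal sketch of such a PVC, with no storageClassName set; the claim name and namespace are hypothetical:

kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: default-sc-pvc      # hypothetical name
  namespace: default
spec:
  # no storageClassName: the default StorageClass (ceph-rbd) is used
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi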

1.7 Creating a storage pool

# Create a replicated pool named rbd with 384 PGs
ceph osd pool create rbd 384 replicated
ceph osd pool set rbd min_size 1
# In a development environment the replica count can be set to 1
ceph osd pool set rbd size 1
# min_size is automatically kept smaller than size
# After reducing size you can immediately see "used" drop in ceph osd status

# Initialize the pool; best done after all nodes have joined and the CRUSH map has been tuned
rbd pool init rbd

# You can create an additional user, e.g. the following, to go with the value storageclass.user_id=k8s
# (the matching K8S Secret is sketched after the table below)
ceph auth get-or-create-key client.k8s mon 'allow r' osd 'allow rwx pool=rbd' | base64
# If you use the default admin user this step is not needed; admin's privileges are sufficient

# Other commands
# Show block device (OSD) usage (requires MGR)
ceph osd status
+----+------------+-------+-------+--------+---------+--------+---------+-----------+
| id |    host    |  used | avail | wr ops | wr data | rd ops | rd data |   state   |
+----+------------+-------+-------+--------+---------+--------+---------+-----------+
| 0  | xenial-100 |  231M | 26.7G |    0   |  3276   |    0   |     0   | exists,up |
| 1  | xenial-103 |  216M | 26.7G |    0   |   819   |    0   |     0   | exists,up |
| 2  | xenial-101 |  253M | 26.7G |    0   |     0   |    0   |     0   | exists,up |
| 3  | xenial-103 |  286M | 26.7G |    0   |     0   |    0   |     0   | exists,up |
| 4  | xenial-101 |  224M | 26.7G |    0   |  1638   |    0   |     0   | exists,up |
| 5  | xenial-105 |  211M | 26.7G |    0   |     0   |    0   |     0   | exists,up |
| 6  | xenial-100 |  243M | 26.7G |    0   |     0   |    0   |     0   | exists,up |
| 7  | xenial-102 |  224M | 26.7G |    0   |  2457   |    0   |     0   | exists,up |
| 8  | xenial-102 |  269M | 26.7G |    0   |  1638   |    0   |     0   | exists,up |
| 9  | xenial-104 |  252M | 26.7G |    0   |  2457   |    0   |     0   | exists,up |
| 10 | xenial-104 |  231M | 26.7G |    0   |     0   |    0   |     0   | exists,up |
| 11 | xenial-105 |  206M | 26.7G |    0   |     0   |    0   |     0   | exists,up |
+----+------------+-------+-------+--------+---------+--------+---------+-----------+
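
If you did create the dedicated client.k8s user above, its key must be stored in a K8S Secret whose name matches the storageclass.user_secret_name value. A sketch, assuming the default name pvc-ceph-client-key and the ceph namespace; the kubernetes.io/rbd type is what the rbd volume plugin usually expects, so verify it against the provisioner version you deploy:

apiVersion: v1
kind: Secret
metadata:
  name: pvc-ceph-client-key      # must match storageclass.user_secret_name
  namespace: ceph
type: kubernetes.io/rbd
data:
  # the base64 output of the "ceph auth get-or-create-key ... | base64" command above
  key: "<base64-encoded key>"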

1.8 Using the storage pool

First try creating and mapping an RBD image with the rbd commands:

# The image format defaults to 2
# format 1 - compatible with all versions of librbd and the kernel module, but lacks newer features
#            such as cloning; this format is deprecated
# format 2 - supported by librbd and kernel 3.11+; adds clone support and is easier to extend in the future
rbd create test --size 1G --image-format 2 --image-feature layering

# Map it as a local block device; if this hangs something is probably wrong, and a hint is printed after a while
rbd map test
# On CentOS 7 you may see:
#   rbd: sysfs write failed
#   In some cases useful info is found in syslog - try "dmesg | tail".
#   rbd: map failed: (5) Input/output error
# dmesg | tail
#   [1180891.928386] libceph: mon0 10.5.39.41:6789 feature set mismatch,
#     my 2b84a042a42 < server's 40102b84a042a42, missing 401000000000000
#   [1180891.934804] libceph: mon0 10.5.39.41:6789 socket error on read
# The fix is to change the bucket algorithm from straw2 to straw

# Mount it as a directory
fdisk /dev/rbd0
mkfs.ext4 /dev/rbd0
mkdir /test
mount /dev/rbd0 /test

# Test performance
# 1MB block writes
sync; dd if=/dev/zero of=/test/data bs=1M count=512; sync
# 512+0 records in
# 512+0 records out
# 536870912 bytes (537 MB) copied, 4.44723 s, 121 MB/s
# 16K random writes
fio -filename=/dev/rbd0 -direct=1 -iodepth 1 -thread -rw=randwrite -ioengine=psync -bs=16k -size=512M -numjobs=30 -runtime=60 -name=test
# WRITE: bw=35.7MiB/s (37.5MB/s), 35.7MiB/s-35.7MiB/s (37.5MB/s-37.5MB/s), io=2148MiB (2252MB), run=60111-60111msec
# 16K random reads
fio -filename=/dev/rbd0 -direct=1 -iodepth 1 -thread -rw=randread -ioengine=psync -bs=16k -size=512M -numjobs=30 -runtime=60 -name=test
# READ: bw=110MiB/s (116MB/s), 110MiB/s-110MiB/s (116MB/s-116MB/s), io=6622MiB (6943MB), run=60037-60037msec

# Remove the test image
umount /test
rbd unmap test
rbd remove test

After confirming that the Ceph RBD can be mapped, read and written, create a PVC:

kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: ceph-pvc
  namespace: ceph
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: ceph-rbd

Check whether the PVC is bound to a PV:

kubectl -n ceph create -f ceph-pvc.yaml
kubectl -n ceph get pvc
# NAME       STATUS    VOLUME                                     CAPACITY   ACCESS MODES   STORAGECLASS   AGE
# ceph-pvc   Bound     pvc-43caef06-46b4-11e8-bed8-deadbeef00a0   1Gi        RWO            ceph-rbd       3s

# On the monitor node, confirm that the RBD image has been created
rbd ls
# kubernetes-dynamic-pvc-fbddb77d-46b5-11e8-9204-8a12961e4b47
rbd info kubernetes-dynamic-pvc-fbddb77d-46b5-11e8-9204-8a12961e4b47
# rbd image 'kubernetes-dynamic-pvc-fbddb77d-46b5-11e8-9204-8a12961e4b47':
#         size 128 MB in 32 objects
#         order 22 (4096 kB objects)
#         block_name_prefix: rbd_data.11412ae8944a
#         format: 2
#         features: layering
#         flags: 
#         create_timestamp: Mon Apr 23 05:20:07 2018

To use this storage pool from another namespace, copy the Secret:

kubectl -n ceph get secrets/pvc-ceph-client-key -o json --export | jq '.metadata.namespace = "default"' | kubectl create -f - 
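
With the Secret in place, a PVC and pod in the default namespace can use the ceph-rbd class as usual. A minimal sketch; the resource names are hypothetical:

kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: app-data             # hypothetical
  namespace: default
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: ceph-rbd
---
apiVersion: v1
kind: Pod
metadata:
  name: rbd-test             # hypothetical
  namespace: default
spec:
  containers:
    - name: app
      image: docker.io/alpine:latest
      command: ["sleep", "3600"]
      volumeMounts:
        - name: data
          mountPath: /data
  volumes:
    - name: data
      persistentVolumeClaim:
        claimName: app-data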

1.9 Uninstalling

helm delete ceph --purge
kubectl delete namespace ceph

In addition, if you want to reinstall, make sure the following directories are wiped on all nodes:

rm -rf /var/lib/ceph-helm
rm -rf /var/lib/ceph

2. Using an existing Ceph cluster

You only need to install the appropriate provisioner and configure a suitable StorageClass. Examples:

  1. Provisioner: https://git.gmem.cc/alex/helm-charts/src/branch/master/ceph-provisioners
  2. Installation script: https://git.gmem.cc/alex/k8s-init/src/branch/master/4.infrastructure/0.ceph-external.sh

2.1 CephFS-based volumes

Dynamic provisioning of CephFS-backed Kubernetes volumes currently relies on the external-storage project; K8S does not ship a built-in provisioner for CephFS. That project has quite a few issues, so static provisioning is worth considering in production (see the sketch below).
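
A statically provisioned CephFS volume can be declared with the in-tree cephfs volume source. This is only a sketch: the monitor address, the pre-created CephFS directory and the ceph-secret-admin Secret (which must exist in the claim's namespace) are assumptions you need to adapt:

apiVersion: v1
kind: PersistentVolume
metadata:
  name: cephfs-static-pv              # hypothetical
spec:
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  cephfs:
    monitors:
      - 10.0.0.1:6789                 # replace with your mon address(es)
    path: /volumes/kubernetes/static-pv   # directory created in CephFS beforehand
    user: admin
    secretRef:
      name: ceph-secret-admin
    readOnly: false
---
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: cephfs-static-pvc             # hypothetical
  namespace: default
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 10Gi
  storageClassName: ""                # avoid falling through to the default StorageClass
  volumeName: cephfs-static-pv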

The provisioner automatically creates "volumes" in the Ceph cluster's default CephFS; Ceph can implement such a virtual volume, backed by a CephFS directory, on top of libcephfs + librados.

You can see a volumes/kubernetes directory in the default CephFS. The kubernetes directory corresponds to a virtual volume group, and each PV maps to one of its subdirectories. A StorageClass for this provisioner is sketched below.
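
For dynamic provisioning via the external-storage cephfs-provisioner, the StorageClass looks roughly like this. The parameter names follow that project's documentation and claimRoot is where the volumes/kubernetes directories mentioned above are created; verify them against the chart version you actually install:

apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: cephfs
provisioner: ceph.com/cephfs
parameters:
  monitors: 10.0.0.1:6789               # replace with your mon address(es)
  adminId: admin
  adminSecretName: ceph-secret-admin    # Secret holding the admin key
  adminSecretNamespace: ceph
  claimRoot: /volumes/kubernetes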