[ceph] Does replication in OSDs and pools add up?

asked 2019-12-03 03:52:25 -0600 by BenvanDamme

Hi,

I am currently investigating the usage of our Ceph storage and I think I might have made a mistake with the replication. I have three physical nodes with six disks each. Is it possible that I have an absolute replication count of nine over the whole cluster, i.e. each object three times in the pool and the pool three times over the OSDs? And if that is the case, what would be the best way to fix it?

Thanks for your help.
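(For reference, the effective replication factor per pool can be checked directly; a hedged example using the same ceph_mon container as in the commands below:)

# show size, min_size and crush rule for every pool
docker exec -ti ceph_mon ceph osd pool ls detail
# or for a single pool, e.g. volumes:
docker exec -ti ceph_mon ceph osd pool get volumes size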

root@DIR2:/home/ubuntu# docker exec -ti ceph_mon rados df
POOL_NAME                 USED    OBJECTS CLONES COPIES  MISSING_ON_PRIMARY UNFOUND DEGRADED RD_OPS     RD      WR_OPS     WR
.rgw.root                 3.08KiB       8      0      24                  0       0        0        510  340KiB          8    8KiB
backups                       19B       2      0       6                  0       0        0        314  241KiB      84514  110GiB
default.rgw.buckets.data  72.3MiB     523      0    1569                  0       0        0       1350 57.4MiB       4946 72.5MiB
default.rgw.buckets.index      0B       1      0       3                  0       0        0       3229 4.60MiB       2059      0B
default.rgw.control            0B       8      0      24                  0       0        0          0      0B          0      0B
default.rgw.log                0B     207      0     621                  0       0        0   79766049 76.1GiB   53150230      0B
default.rgw.meta             780B       4      0      12                  0       0        0       1107 1018KiB         84   13KiB
gnocchi                   1.56GiB  128638      0  385914                  0       0        0  506847789  871GiB  546727389  242GiB
images                    35.1GiB    4536      0   13608                  0       0        0     292167  114GiB     105460 98.8GiB
vms                       70.6GiB   18328      0   54984                  0       0        0 1418007932 75.7TiB 1283170530 11.8TiB
volumes                   3.05TiB  802897   2489 2408691                  0       0        0 1259978578 20.3TiB 1200805704 45.0TiB

total_objects    955152
total_used       9.44TiB
total_avail      5.20TiB
total_space      14.6TiB




root@DIR2:/home/ubuntu# docker exec -ti ceph_mon rbd -p volumes du
NAME                                                                                      PROVISIONED    USED
volume-083f6e89-8ae5-4fde-908d-e52241cd0537@snapshot-5a98b309-313e-4ccf-8113-e0353c3ecf74       40GiB 19.4GiB
volume-083f6e89-8ae5-4fde-908d-e52241cd0537                                                     40GiB 19.6GiB
volume-12d0cba0-e2ec-4e5b-9dd2-fd1e36e03663@snapshot-79ad1e0f-1c91-4104-8490-eadeee28c572       40GiB 37.5GiB
volume-12d0cba0-e2ec-4e5b-9dd2-fd1e36e03663                                                     40GiB 37.1GiB
volume-24a65e11-1432-4f87-9917-ebbb6d96574d                                                     20GiB 9.82GiB
volume-3f76f808-7870-4e4e-9b74-e2225f1f6594                                                     61GiB 53.3GiB
volume-46a03999-c012-4565-8522-948a86f4175f                                                     35GiB 5.17GiB
volume-4cc292e1-42f1-4060-a520-b46fee51640e                                                    300GiB 76.1GiB
volume-4d44c495-213f-4265-ac3c-a3fe4b43f642                                                     40GiB 36.7GiB
volume-582da869-6e89-49f5-b493-467646eb3d46                                                     30GiB 16.7GiB
volume-6f22fdb0-e3c9-4f7d-9e5c-bbf2d53cd2cf                                                     30GiB 5.38GiB
volume-6f3440bc-6360-4cd6-97f7-1d60ef350280                                                     40GiB 23.8GiB
volume-75c84d73-86cc-44d7-8037-9db46b28444f@snapshot-cc1e98c0-e6a5-4694-805f-7612be34251a       35GiB 32.3GiB
volume-75c84d73-86cc-44d7-8037-9db46b28444f                                                     35GiB      0B
volume-7d2a4d7c-7634-47ad-92f3-e776400f2141                                                     10GiB 9.97GiB
volume-83fadc2f-7bf2-447a-838a-2fcd07622255                                                     80GiB 59.0GiB
volume-84bf07fb-2d90-4cd6-bce1-7d348c43677a                                                     40GiB 13.8GiB
volume-987c4a60-3a4f-4842-b097-4888e48b6158                                                     50GiB 37.6GiB
volume-a08780fd-169c-4382-9704-05f75877eac0                                                     30GiB 23.5GiB
volume-a2a60be3-71ff-46af-a277-3e32169a138a                                                    300GiB  294GiB
volume-a3db896f-3655-41a0-b33f-839f53fab082                                                     30GiB 21.3GiB
volume-ac44bea6-917d-4840-b64c-ebc10bb0f587                                                     40GiB 19.0GiB
volume-aeed382c-0de7-4912-9f14-7511867a1b92                                                     50GiB 30.0GiB
volume-b07efa2f-4955-4488-8bee-91574bd0b699                                                    150GiB  142GiB
volume-bb5049fc-edee-4c85-9d3b-21dd832600da                                                      5GiB 4.86GiB
volume-bf45d75e-38c5-48b5-966e-e81436094923                                                    300GiB 12.4GiB
volume-c9667c0b-50a1-49f0-9c8b-3abbe0d52478                                                     50GiB 39.5GiB
volume-e9d81955-4c00-4b9f-ab5f-23d658daf055                                                     40GiB 37.7GiB
<TOTAL>                                                                                       1.80TiB 1.09TiB











root@DIR2:/home/ubuntu# docker exec -ti ceph_mon ceph osd dump
epoch 8968
fsid 51276edb-2b73-4610-8a62-1d028a9ba8d6
created 2018-10-05 19:28:29.139438
modified 2019-12-03 08:19:45.327152
flags sortbitwise,recovery_deletes,purged_snapdirs
crush_version 131
full_ratio 0.95
backfillfull_ratio 0.9
nearfull_ratio 0.67
require_min_compat_client jewel
min_compat_client jewel
require_osd_release luminous
pool 1 '.rgw.root' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 8 pgp_num 8 last_change 8513 flags hashpspool,nearfull stripe_width 0 application rgw
pool 2 'images' replicated size 3 min_size 2 crush_rule 1 object_hash rjenkins pg_num 128 pgp_num 128 last_change ...
(more)

Comments

How do you count 9 replicas? First, all your pools show replicated size 3. Then you can check the number of objects for each pool (from your rados df output): the "OBJECTS" column reflects the net number of objects, while the "COPIES" column shows that value tripled.

eblock (2019-12-03 09:18:54 -0600)
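As a quick sanity check of that ratio, something like this could be run (a sketch that assumes the Luminous column layout shown in the question, with OBJECTS in column 3 and COPIES in column 5):

# COPIES divided by OBJECTS should equal the pool's replicated size (3 here)
docker exec -ti ceph_mon rados df | awk 'NR > 1 && $3 > 0 { printf "%s: %d copies per object\n", $1, $5 / $3 }'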

The replication factor of 9 is just a guess. rbd -p volumes du reports a usage of ~1 TB for the volumes pool, while rados df reports ~3 TB and a raw disk usage of ~9 TB, so I guessed that I made a mistake and there are too many replicas.

BenvanDamme (2019-12-03 09:42:39 -0600)

The rbd du command shows you the assigned space vs. the actual usage of a pool/image. Ceph rbd objects are usually stored as copy-on-write images. So the rados df command shows the assigned space in total, which is around 3 TB (net), but currently you're actually using around 1 TB.

eblock (2019-12-04 02:05:26 -0600)

So that is the expected behaviour, but why does ceph df detail say that the volumes pool consumes ~9 TB of raw storage? In total we have 14.6 TiB of physical storage and the monitoring is showing nearfull OSDs, from only ~1 TB of stored data.

BenvanDamme (2019-12-04 02:57:10 -0600)

Which ceph version are you running? Depending on the version, the rados df output shows either net or gross values. My current understanding would be: your volumes pool has allocated 3 TB of data (net) (and actually uses around 1.9 TB at the moment because of CoW), and because of replication size 3 the pool requires 9 TB of raw space.

eblock (2019-12-04 03:56:47 -0600)
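Roughly, with the rados df numbers from the question, that reasoning works out as follows (a back-of-the-envelope check, not exact accounting):

# sum of USED over all pools (net)  ~ 3.16 TiB  (volumes 3.05 + vms 0.07 + images 0.035 + ...)
# x 3 replicas                      ~ 9.5 TiB
# reported total_used                 9.44 TiB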

If your cluster is nearfull (don't let it reach the full state!), try to find some clones and/or images you can remove. Using Ceph with OpenStack can create orphaned objects within your pools which you have to clean up manually; check that too.

eblock (2019-12-04 03:58:54 -0600)

We are running ceph version 12.2.8 (ae699615bac534ea496ee965ac6192cb7e0e07c0) luminous (stable). We created OpenStack VM volumes which total 1.8 TiB; actually used is 1.17 TiB. ceph df detail lists the pool at 3.13 TB and suggests it consumes 9.38 TB of physical storage. How do I clean this up?

BenvanDamme (2019-12-04 04:57:35 -0600)

Yep, Luminous shows the net value in rados df. So your volumes pool consumes around 3 TB at the moment, 9 TB in total (raw); the numbers make sense. To clean up, I would suggest listing all rbd images in your pool(s) with rbd -p <pool> ls and checking whether you can map all of them to existing instances/volumes.

eblock (2019-12-04 05:43:09 -0600)
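A hedged sketch of that comparison (it assumes the usual Cinder naming scheme volume-<UUID> for RBD images; rbd_ids and os_ids are just placeholder file names):

# RBD images in the volumes pool, reduced to their UUIDs
docker exec ceph_mon rbd -p volumes ls | sed 's/^volume-//' | sort > rbd_ids
# Cinder volume IDs known to OpenStack
openstack volume list --all-projects -f value -c ID | sort > os_ids
# anything that exists only on the Ceph side is a candidate for closer inspection
comm -23 rbd_ids os_ids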

The command rbd -p <pool> ls --long shows all relationships (glance images and their clones). Check for orphaned snapshots that are not needed anymore. If there isn't anything to clean up, make sure to expand your ceph cluster. Also check whether your PG placement is evenly balanced (ceph osd df).

eblock (2019-12-04 05:45:38 -0600)
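For the snapshot check, a minimal sketch (run inside the ceph_mon container; the pool name volumes is assumed):

# list every snapshot that is still attached to an image in the volumes pool
for img in $(rbd -p volumes ls); do
  rbd snap ls "volumes/${img}" | grep -v SNAPID | sed "s|^|${img}: |"
done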

Hi, I compared rbd -p volumes ls --long and openstack volume list; there is nothing to clean up. I really have no idea how 1 TB of used storage ends up consuming 9 TB of physical storage. That would mean that 1.77 TB of provisioned volumes generate 1.33 TB of overhead, if this is the correct behaviour.

BenvanDamme (2019-12-04 08:58:09 -0600)

When I compare it with the output for the images pool, it seems obvious that something is wrong. rbd -p images du lists 35.1 GiB provisioned and also used. ceph df detail shows exactly those 35.1 GiB as used, and not three times the used value as with volumes.

BenvanDamme (2019-12-04 10:11:15 -0600)

Is it possible that it has something to do with the placement groups? For volumes pg_num is set to 512 and for images to 128. Perhaps this is the reason for such a huge overhead.

BenvanDamme (2019-12-04 10:13:56 -0600)

The overhead is not that big; that's not it. But I see that your nearfull_ratio is at 67%, which is quite low; the default here is 85%. Is there a reason for such a low value?

eblock (2019-12-05 01:56:46 -0600)
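If the 67% turns out to be unintentional, the threshold can be raised again at runtime (a hedged example, run inside the ceph_mon container; 0.85 is the usual default):

# raise the nearfull warning threshold back to the default value
ceph osd set-nearfull-ratio 0.85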

The 67% I got from the calculator at http://florian.ca/ceph-calculator/ for our three-node setup.

BenvanDamme (2019-12-05 02:57:18 -0600)

I don't have a definite answer for you, but the statistics (usage, availability, etc.) in Ceph in particular are still not really clear and differ between versions. My assumption is that although your volumes use just 1.8 TB, there are obviously snapshots present...

eblock (2019-12-06 03:10:40 -0600)

...Since they're copy-on-write, their usage increases as soon as the data changes, and that leads to more than the assigned 1.8 TB. Of course, some overhead has to be taken into account, too. Unfortunately, I didn't take the time to watch the usage in our cluster in the beginning; now it's too large.

eblock (2019-12-06 03:13:29 -0600)

In the beginning we had some snapshots; I had to flatten some volumes in order to delete all of them. The 1.8 TB is just the sum of the maximum volume sizes assigned when the instances were created. So I am still unable to understand where the space is wasted.

BenvanDamme (2019-12-09 03:42:27 -0600)

The maximum size of all volumes together is 1.8 TB, and 1.10 TB is used. Three times 1.8 TB should end up at a raw usage of 5.4 TB, while it really uses 9.3 TB. Is it possible that there are unlisted zombie volumes eating all the space? :D

BenvanDamme (2019-12-09 03:45:04 -0600)

Of course it is possible, I wrote that a couple of comments ago. ;-) It's possible to delete instances etc., but those rbd images can still exist in Ceph if they had snapshots before. So you should not only check whether your OpenStack images etc. exist in Ceph, but also the other way around.

eblock (2019-12-10 03:41:29 -0600)

Ah yes, this I do understand. I compared the volumes listed in OpenStack and in rbd; there is no difference. What I tried to say was: even if something is no longer listed in rbd or OpenStack, is it possible that the data of deleted snapshots or volumes is still on the OSDs?

BenvanDamme (2019-12-10 04:30:10 -0600)

Dumb question: is there something like fsck for Ceph, which scans for orphaned objects/PGs or something like that?

BenvanDamme (2019-12-10 04:31:54 -0600)

No, not for rbd objects. I believe there's only an orphan scan for the rados gateway, and something similar for CephFS. But you can check whether you find an rbd image for every rbd_data prefix you have in the pool. So you create a list of all objects in the pool with rados -p volumes ls and only keep the...

eblock (2019-12-10 05:10:11 -0600)

middle part:

[rbd_data.]d18c7866334873[.00...7203]

Remove all duplicates from that list and then check whether an actual image exists for each prefix:

for i in $(cat remainder); do for j in $(rbd -p volumes ls); do if rbd info volumes/"$j" | grep -q "$i"; then echo "Image: $j"; break; fi; done; done
eblock (2019-12-10 05:13:53 -0600)

Adjust the for loop to print those IDs that are not found so you get the orphans.

eblock (2019-12-10 05:16:09 -0600)
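A minimal sketch of such an adjusted loop (assuming the de-duplicated prefixes are in a file called remainder, as in the comment above):

# print every rbd_data prefix from "remainder" that no existing image in the volumes pool claims
for i in $(cat remainder); do
  found=0
  for j in $(rbd -p volumes ls); do
    if rbd info volumes/"$j" | grep -q "$i"; then
      found=1
      break
    fi
  done
  [ "$found" -eq 0 ] && echo "orphan prefix: $i"
done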

Okay, I think we are on the right track. I did it the other way round: I made a list of all volumes and a list of all objects, and for every volume I deleted every line starting with that volume's block_name_prefix from the list of objects. I started with 829718 objects and now have 528676 left.

BenvanDamme (2019-12-10 08:03:29 -0600)

for j in $(cat existent_volumes); do
  # look up this volume's block_name_prefix (e.g. rbd_data.d18c7866334873)
  PREFIX="$(docker exec -ti ceph_mon rbd info volumes/${j} | grep block_name_prefix | awk -F ":" '{print $2}' | tr -d '[:space:]')"
  # drop every object whose name starts with that prefix from the objects list
  sed -i "/^${PREFIX}/d" objects
done

BenvanDamme (2019-12-10 08:04:04 -0600)

Alright, sounds like there are quite a lot of orphans, right? But before you purge those objects, make sure that they really, really don't belong to any existing rbd image and are not related to any client etc. And then double- and triple-check before deleting anything. ;-)

eblock (2019-12-10 08:28:20 -0600)
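If, after that verification, a prefix really does turn out to be orphaned, the matching objects could then be removed one by one, for example (the prefix below is only the example ID from the earlier comment; this is destructive, so double-check every object name first):

# collect and then delete all objects belonging to ONE verified orphan prefix
rados -p volumes ls | grep '^rbd_data.d18c7866334873' > to_delete
while read -r obj; do
  rados -p volumes rm "$obj"
done < to_delete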