Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix riemann-health memory reporting when using ZFS on Linux #289

Merged
merged 2 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 39 additions & 2 deletions lib/riemann/tools/health.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class Health
include Riemann::Tools
include Riemann::Tools::Utils

PROC_PID_INIT_INO = 0xEFFFFFFC
SI_UNITS = '_kMGTPEZYRQ'

opt :cpu_warning, 'CPU warning threshold (fraction of total jiffies)', default: 0.9
Expand Down Expand Up @@ -158,6 +159,11 @@ def report_uptime(uptime)
end
end

# Tells whether this process lives in a PID namespace other than the
# initial one, which is taken as a sign of running inside a container.
#
# The answer is computed once by resolving the /proc/self/ns/pid symlink
# and comparing it with the identifier built from PROC_PID_INIT_INO; the
# result is then cached in @linux_running_in_container.
#
# Returns true when the namespaces differ, false otherwise.
def linux_running_in_container?
  # An explicit nil check is needed: a cached `false` is a valid answer.
  return @linux_running_in_container unless @linux_running_in_container.nil?

  current_pid_namespace = File.readlink('/proc/self/ns/pid')
  @linux_running_in_container = current_pid_namespace != "pid:[#{PROC_PID_INIT_INO}]"
end

def linux_cpu
new = File.read('/proc/stat')
unless new[/cpu\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/]
Expand Down Expand Up @@ -198,13 +204,44 @@ def linux_memory
info[x[0]] = x[1].to_i
end

free = m['MemFree'].to_i + m['Buffers'].to_i + m['Cached'].to_i
total = m['MemTotal'].to_i
free = m['MemFree'] + m['Buffers'] + m['Cached'] + linux_zfs_arc_evictable_memory
total = m['MemTotal']
fraction = 1 - (free.to_f / total)

report_pct :memory, fraction, "used\n\n#{reverse_numeric_sort_with_header(`ps -eo pmem,pid,comm`)}"
end

# On Linux, the ZFS ARC is reported as used, not as cached memory.
# https://github.com/openzfs/zfs/issues/10251
#
# Gather ZFS ARC statistics about evictable memory. The available
# fields are listed here:
# https://github.com/openzfs/zfs/blob/master/include/sys/arc_impl.h
# Sums the evictable data and metadata counters found in
# /proc/spl/kstat/zfs/arcstats and returns the total in kB.
#
# Returns 0 when running in a container or when a file being read does
# not exist (Errno::ENOENT).
def linux_zfs_arc_evictable_memory
  # A container may see the host's ARC statistics, which would skew its
  # own memory usage; only the host itself should discount evictable ARC
  # memory.
  return 0 if linux_running_in_container?

  # Each row is "<name> <type> <value>"; rows without a third column
  # (or with a non-numeric one, such as the headers) are stored as 0.
  arcstats = Hash.new(0)
  File.readlines('/proc/spl/kstat/zfs/arcstats').each do |row|
    name, _type, value = row.split(/\s+/)
    arcstats[name] = value.to_i
  end

  evictable_fields = %w[
    anon_evictable_data
    anon_evictable_metadata
    mru_evictable_data
    mru_evictable_metadata
    mfu_evictable_data
    mfu_evictable_metadata
    uncached_evictable_data
    uncached_evictable_metadata
  ]

  # Missing fields count as 0 thanks to the hash default. The sum is
  # divided by 1024 because the result is wanted in kB.
  evictable_fields.sum { |field| arcstats[field] } / 1024
rescue Errno::ENOENT
  0
end

def freebsd_cpu
u2, n2, s2, t2, i2 = `sysctl -n kern.cp_time 2>/dev/null`.split.map(&:to_i) # FreeBSD has 5 cpu stats

Expand Down
6 changes: 2 additions & 4 deletions lib/riemann/tools/http_check.rb
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,8 @@ def tick
def test_uri_addresses(uri, addresses)
request = get_request(uri)

responses = []

addresses.each do |address|
responses << test_uri_address(uri, address.to_s, request)
responses = addresses.map do |address|
test_uri_address(uri, address.to_s, request)
end

responses.compact!
Expand Down
159 changes: 159 additions & 0 deletions spec/riemann/tools/health_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,165 @@
end
end

# Checks container detection by stubbing the /proc/self/ns/pid symlink
# target with the value supplied by each context.
describe '#linux_running_in_container?' do
  before { allow(File).to receive(:readlink).with('/proc/self/ns/pid').and_return(pid_ns_link) }

  context 'when running on the host' do
    # 4026531836 is 0xEFFFFFFC, i.e. PROC_PID_INIT_INO.
    let(:pid_ns_link) { 'pid:[4026531836]' }

    it 'returns the expected value' do
      expect(subject).not_to be_linux_running_in_container
    end
  end

  context 'when running in a container' do
    # Any other inode number means a different PID namespace.
    let(:pid_ns_link) { 'pid:[4026532474]' }

    it 'returns the expected value' do
      expect(subject).to be_linux_running_in_container
    end
  end
end

# Exercises #linux_zfs_arc_evictable_memory against a canned arcstats listing.
describe '#linux_zfs_arc_evictable_memory' do
# Force the non-container code path and replace the read of
# /proc/spl/kstat/zfs/arcstats with the fixture below, split into lines.
before do
allow(subject).to receive(:linux_running_in_container?).and_return(false)
allow(File).to receive(:readlines).with('/proc/spl/kstat/zfs/arcstats').and_return(<<~OUTPUT.split("\n"))
12 1 0x01 123 33456 16771914747167 65909736923948
name type data
hits 4 4194887
misses 4 10500
demand_data_hits 4 107986
demand_data_misses 4 2
demand_metadata_hits 4 4058473
demand_metadata_misses 4 9216
prefetch_data_hits 4 28396
prefetch_data_misses 4 1207
prefetch_metadata_hits 4 32
prefetch_metadata_misses 4 75
mru_hits 4 882890
mru_ghost_hits 4 7737
mfu_hits 4 3311966
mfu_ghost_hits 4 2306
deleted 4 42072676
mutex_miss 4 1771
access_skip 4 0
evict_skip 4 1004
evict_not_enough 4 68
evict_l2_cached 4 0
evict_l2_eligible 4 5516808656384
evict_l2_eligible_mfu 4 216467456
evict_l2_eligible_mru 4 5516592188928
evict_l2_ineligible 4 6029312
evict_l2_skip 4 0
hash_elements 4 124644
hash_elements_max 4 158917
hash_collisions 4 1256052
hash_chains 4 1793
hash_chain_max 4 4
p 4 8383553536
c 4 16767066112
c_min 4 1047941632
c_max 4 16767066112
size 4 16791717984
compressed_size 4 16052915712
uncompressed_size 4 16066757120
overhead_size 4 685315584
hdr_size 4 40836960
data_size 4 16724525056
metadata_size 4 13706240
dbuf_size 4 3017856
dnode_size 4 8890304
bonus_size 4 734400
anon_size 4 205520896
anon_evictable_data 4 0
anon_evictable_metadata 4 0
mru_size 4 16532327936
mru_evictable_data 4 15577382912
mru_evictable_metadata 4 2461696
mru_ghost_size 4 225443840
mru_ghost_evictable_data 4 52822016
mru_ghost_evictable_metadata 4 172621824
mfu_size 4 382464
mfu_evictable_data 4 0
mfu_evictable_metadata 4 0
mfu_ghost_size 4 18432
mfu_ghost_evictable_data 4 0
mfu_ghost_evictable_metadata 4 18432
l2_hits 4 0
l2_misses 4 0
l2_prefetch_asize 4 0
l2_mru_asize 4 0
l2_mfu_asize 4 0
l2_bufc_data_asize 4 0
l2_bufc_metadata_asize 4 0
l2_feeds 4 0
l2_rw_clash 4 0
l2_read_bytes 4 0
l2_write_bytes 4 0
l2_writes_sent 4 0
l2_writes_done 4 0
l2_writes_error 4 0
l2_writes_lock_retry 4 0
l2_evict_lock_retry 4 0
l2_evict_reading 4 0
l2_evict_l1cached 4 0
l2_free_on_write 4 0
l2_abort_lowmem 4 0
l2_cksum_bad 4 0
l2_io_error 4 0
l2_size 4 0
l2_asize 4 0
l2_hdr_size 4 0
l2_log_blk_writes 4 0
l2_log_blk_avg_asize 4 0
l2_log_blk_asize 4 0
l2_log_blk_count 4 0
l2_data_to_meta_ratio 4 0
l2_rebuild_success 4 0
l2_rebuild_unsupported 4 0
l2_rebuild_io_errors 4 0
l2_rebuild_dh_errors 4 0
l2_rebuild_cksum_lb_errors 4 0
l2_rebuild_lowmem 4 0
l2_rebuild_size 4 0
l2_rebuild_asize 4 0
l2_rebuild_bufs 4 0
l2_rebuild_bufs_precached 4 0
l2_rebuild_log_blks 4 0
memory_throttle_count 4 0
memory_direct_count 4 0
memory_indirect_count 4 0
memory_all_bytes 4 33534132224
memory_free_bytes 4 14101360640
memory_available_bytes 3 12877639168
arc_no_grow 4 0
arc_tempreserve 4 132096
arc_loaned_bytes 4 0
arc_prune 4 0
arc_meta_used 4 65964976
arc_meta_limit 4 12575299584
arc_dnode_limit 4 1257529958
arc_meta_max 4 129330864
arc_meta_min 4 16777216
async_upgrade_sync 4 2597
demand_hit_predictive_prefetch 4 907
demand_hit_prescient_prefetch 4 0
arc_need_free 4 0
arc_sys_free 4 1223721472
arc_raw_size 4 0
cached_only_in_progress 4 0
abd_chunk_waste_size 4 7168
OUTPUT
end

# Among the fields the method sums, only mru_evictable_data (15577382912)
# and mru_evictable_metadata (2461696) are non-zero in the fixture, and the
# uncached_* fields are absent: (15577382912 + 2461696) / 1024 = 15214692.
it 'return the expected size' do
expect(subject.linux_zfs_arc_evictable_memory).to eq(15_214_692)
end
end

describe('#disks') do
before do
allow(subject).to receive(:df).and_return(<<~OUTPUT)
Expand Down
6 changes: 2 additions & 4 deletions tools/riemann-docker/lib/riemann/tools/docker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -171,10 +171,8 @@ def tick
disk if @disk_enabled

# Get CPU, Memory and Load of each container
threads = []

containers.each do |ctr|
threads << Thread.new(ctr) do |container|
threads = containers.map do |ctr|
Thread.new(ctr) do |container|
id = container.id
name = get_container_name(container)

Expand Down