add summary reporting, rework expiry logic

This commit is contained in:
holger krekel
2025-09-11 16:17:50 +02:00
parent b13929119b
commit 023116bc91
4 changed files with 213 additions and 130 deletions

View File

@@ -28,6 +28,7 @@ filtermail = "chatmaild.filtermail:main"
echobot = "chatmaild.echo:main" echobot = "chatmaild.echo:main"
chatmail-metrics = "chatmaild.metrics:main" chatmail-metrics = "chatmaild.metrics:main"
expire = "chatmaild.expire:main" expire = "chatmaild.expire:main"
fsreport = "chatmaild.fsreport:main"
lastlogin = "chatmaild.lastlogin:main" lastlogin = "chatmaild.lastlogin:main"
turnserver = "chatmaild.turnserver:main" turnserver = "chatmaild.turnserver:main"

View File

@@ -1,42 +1,47 @@
"""
Expire old messages and addresses.
"""
import os import os
import shutil import shutil
import sys import sys
import time
from collections import namedtuple
from datetime import datetime from datetime import datetime
from stat import S_ISREG from stat import S_ISREG
from chatmaild.config import read_config from chatmaild.config import read_config
# delete already seen big mails after 7 days, in the INBOX # XXX maildirsize (used by dovecot quota) needs to be removed after removing files
# 2 0 * * * vmail find {{ config.mailboxes_dir }} -path '*/cur/*' -mtime +{{ config.delete_large_after }} -size +200k -type f -delete
# # delete all mails after {{ config.delete_mails_after }} days, in the Inbox
# 3 0 * * * vmail find {{ config.mailboxes_dir }} -name 'maildirsize' -type f -delete
FileEntry = namedtuple("FileEntry", ["relpath", "mtime", "size"]) class FileEntry:
dayseconds = 24 * 60 * 60 def __init__(self, relpath, mtime, size):
monthseconds = dayseconds * 30 self.relpath = relpath
self.mtime = mtime
self.size = size
def __repr__(self):
return f"<FileEntry size={self.size} '{self.relpath}'>"
def fmt_size(self):
return f"{int(self.size/1000):5.0f}K"
def fmt_since(self, now):
diff_seconds = int(now) - int(self.mtime)
return f"{int(diff_seconds / 86400):2.0f}d"
def __eq__(self, other):
return (
self.relpath == other.relpath
and self.size == other.size
and self.mtime == other.mtime
)
def joinpath(name, extra): def joinpath(name, extra):
return name + "/" + extra return name + "/" + extra
def D(timestamp, now=datetime.utcnow().timestamp()):
diff_seconds = int(now) - int(timestamp)
# assert diff_seconds >= 0, (int(timestamp), int(now))
return f"{int(diff_seconds / dayseconds):2.0f}d"
def K(size):
return f"{int(size/1000):6.0f}K"
def M(size):
return f"{int(size/1000000):6.0f}M"
class Stats: class Stats:
def __init__(self, basedir, maxnum=None): def __init__(self, basedir, maxnum=None):
self.basedir = str(basedir) self.basedir = str(basedir)
@@ -53,9 +58,16 @@ class Stats:
class MailboxStat: class MailboxStat:
def __init__(self, mailboxdir): def __init__(self, mailboxdir):
self.mailboxdir = mailboxdir = str(mailboxdir) self.mailboxdir = mailboxdir = str(mailboxdir)
# all detected messages in cur/new/tmp folders
self.messages = [] self.messages = []
# all detected files in mailbox top dir
self.extrafiles = [] self.extrafiles = []
# total size of all detected files
self.totalsize = 0
# scan all relevant files (without recursion)
for name in os.listdir(mailboxdir): for name in os.listdir(mailboxdir):
fpath = joinpath(mailboxdir, name) fpath = joinpath(mailboxdir, name)
if name in ("cur", "new", "tmp"): if name in ("cur", "new", "tmp"):
@@ -66,11 +78,13 @@ class MailboxStat:
self.messages.append( self.messages.append(
FileEntry(relpath, mtime=st.st_mtime, size=st.st_size) FileEntry(relpath, mtime=st.st_mtime, size=st.st_size)
) )
self.totalsize += st.st_size
else: else:
st = os.stat(fpath) st = os.stat(fpath)
if S_ISREG(st.st_mode): if S_ISREG(st.st_mode):
self.extrafiles.append(FileEntry(name, st.st_mtime, st.st_size)) self.extrafiles.append(FileEntry(name, st.st_mtime, st.st_size))
self.extrafiles.sort(key=lambda x: x.size, reverse=True) self.totalsize += st.st_size
self.extrafiles.sort(key=lambda x: -x.size)
@property @property
def last_login(self): def last_login(self):
@@ -78,101 +92,8 @@ class MailboxStat:
if entry.relpath == "password": if entry.relpath == "password":
return entry.mtime return entry.mtime
def get_messages(self, prefix=""):
l = []
for entry in self.messages:
if entry.relpath.startswith(prefix):
l.append(entry)
return l
def get_extra_files(self):
return list(self.extrafiles)
def get_file_entry(self, name):
for entry in self.extrafiles:
if name == entry.relapth:
return entry
class XXXStats:
def __init__(self):
self.sum_extra = 0
self.sum_all_messages = 0
self.logins = []
self.messages = []
def analyze(self, statscache):
print("start")
for mailbox in statscache.cache:
mbox_cache = statscache.cache[mailbox]
if "password" not in mbox_cache:
continue
self.logins.append(mbox_cache["password"][0])
for relpath, (mtime, size) in mbox_cache.items():
if relpath[:4] in ("cur/", "new/", "tmp/"):
self.sum_all_messages += size
entry = FileEntry(relpath=relpath, mtime=mtime, size=size)
self.messages.append(entry)
else:
self.sum_extra += size
def dump_summary(self):
now = datetime.utcnow().timestamp()
print(f"size of everything: {M(self.sum_extra + self.sum_all_messages)}")
print(f"size all messages: {M(self.sum_all_messages)}")
percent = self.sum_extra / (self.sum_extra + self.sum_all_messages) * 100
print(f"size extra files: {M(self.sum_extra)} ({percent:.2f}%)")
for size in (100000, 200000, 500000, 1000000, 5000000):
all_of_size = sum(
x.size
for x in self.messages
if x.size > size and x.relpath.startswith("cur")
)
percent = all_of_size / self.sum_all_messages * 100
print(f"size seen {K(size)} messages: {M(all_of_size)} ({percent:.2f}%)")
for size in (100000, 200000, 500000, 1000000, 5000000):
all_of_size = sum(
x.size
for x in self.messages
if x.size > size and x.mtime < now - 2 * dayseconds
)
percent = all_of_size / self.sum_all_messages * 100
print(
f"size 2day-old {K(size)} messages: {M(all_of_size)} ({percent:.2f}%)"
)
for size in (100000, 200000, 500000, 1000000, 5000000):
all_of_size = sum(
x.size
for x in self.messages
if x.size > size
and x.relpath.startswith("cur")
and x.mtime < now - 7 * dayseconds
)
percent = all_of_size / self.sum_all_messages * 100
print(
f"size seen 7-day old {K(size)} messages: {M(all_of_size)} ({percent:.2f}%)"
)
print()
num_logins = len(self.logins)
monthly_active = len([x for x in self.logins if x >= now - monthseconds])
daily_active = len([x for x in self.logins if x >= now - dayseconds])
stale = num_logins - monthly_active
def p(num):
return f"({num/num_logins * 100:.2f}%)"
print(f"all logins: {K(num_logins)}")
print(f"monthly active: {K(monthly_active)} {p(monthly_active)}")
print(f">1m old logins: {K(stale)} {p(stale)}")
print(f"daily active: {K(daily_active)} {p(daily_active)}")
def run_expire(config, basedir, dry=False, maxnum=None):
now = time.time()
def run_expire(config, basedir, now, dry=True, maxnum=None):
stat = Stats(basedir, maxnum=maxnum) stat = Stats(basedir, maxnum=maxnum)
stat.iter_mailboxes() stat.iter_mailboxes()
cutoff_date_without_login = now - int(config.delete_inactive_users_after) * 86400 cutoff_date_without_login = now - int(config.delete_inactive_users_after) * 86400
@@ -188,7 +109,9 @@ def run_expire(config, basedir, dry=False, maxnum=None):
def unlink(mailboxdir, message): def unlink(mailboxdir, message):
if dry: if dry:
relpath = os.path.basename(mailboxdir) + message.relpath relpath = os.path.basename(mailboxdir) + message.relpath
print(f"would remove {D(message.mtime)} {K(message.size)} {relpath}") print(
f"would remove {message.fmt_since(now)} {message.fmt_size()} {relpath}"
)
else: else:
os.unlink(path) os.unlink(path)
@@ -217,7 +140,9 @@ def run_expire(config, basedir, dry=False, maxnum=None):
def main(): def main():
cfgpath, basedir, maxnum = sys.argv[1:] cfgpath, basedir, maxnum = sys.argv[1:]
config = read_config(cfgpath) config = read_config(cfgpath)
run_expire(config, basedir, dry=True, maxnum=int(maxnum)) now = datetime.utcnow().timestamp()
now = datetime(2025, 9, 9).timestamp()
run_expire(config, basedir, maxnum=int(maxnum), now=now)
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -0,0 +1,150 @@
import os
import sys
from datetime import datetime
from chatmaild.config import read_config
from chatmaild.expire import FileEntry, Stats, joinpath
# Durations in seconds, used for age and cutoff computations.
DAYSECONDS = 24 * 60 * 60
MONTHSECONDS = DAYSECONDS * 30


def D(timestamp, now=None):
    """Return the age of ``timestamp`` in whole days, formatted like `` 3d``.

    ``timestamp`` and ``now`` are seconds since the epoch.  When ``now`` is
    omitted the current time is looked up on every call; a default computed
    in the signature would be frozen at import time.
    """
    if now is None:
        # datetime.now().timestamp() is the real epoch time, whereas
        # utcnow().timestamp() is off by the local UTC offset.
        now = datetime.now().timestamp()
    diff_seconds = int(now) - int(timestamp)
    return f"{int(diff_seconds / DAYSECONDS):2.0f}d"
def K(size):
    """Format ``size`` right-aligned to width 5, in whole kilobytes if >= 1000.

    Values below 1000 are printed as-is without a suffix; larger values are
    divided by 1000, truncated and suffixed with ``K``.
    """
    if size >= 1000:
        value, suffix = int(size / 1000), "K"
    else:
        value, suffix = size, ""
    return f"{value:5.0f}{suffix}"
def M(size):
    """Format ``size`` as truncated whole megabytes (10**6), width 5, plus ``M``."""
    megabytes = int(size / 1000000)
    return "{:5.0f}M".format(megabytes)
def H(size):
    """Format ``size`` with the largest fitting unit.

    Gigabytes (two decimals) from 10**9 upwards, otherwise ``M()`` from
    10**6 upwards, otherwise ``K()``.
    """
    giga = 1000 * 1000 * 1000
    mega = 1000 * 1000
    if size >= giga:
        return f"{size/1000000000:2.2f}G"
    formatter = M if size >= mega else K
    return formatter(size)
class Report:
    """Collect message, extra-file and login statistics and print a summary.

    ``stats`` must expose a ``mailboxes`` iterable whose items provide
    ``mailboxdir``, ``last_login``, ``messages`` and ``extrafiles``.
    ``now`` is the reference time in seconds since the epoch; all ages and
    login cutoffs are computed relative to it.
    """

    def __init__(self, stats, now):
        self.sum_extra = 0
        self.sum_all_messages = 0
        self.messages = []
        self.user_logins = []
        self.ci_logins = []
        self.stats = stats
        self.now = now

        for mailbox in stats.mailboxes:
            mailbox_name = os.path.basename(mailbox.mailboxdir)
            last_login = mailbox.last_login
            if last_login:
                # mailboxes named "ci-..." are counted separately from users
                if mailbox_name.startswith("ci-"):
                    self.ci_logins.append(last_login)
                else:
                    self.user_logins.append(last_login)
            # NOTE(review): assumes messages and extra files are tallied for
            # every mailbox, including those without a last_login -- confirm.
            for entry in mailbox.messages:
                # re-key the entry so its path also names the mailbox
                new = FileEntry(
                    relpath=joinpath(mailbox_name, entry.relpath),
                    mtime=entry.mtime,
                    size=entry.size,
                )
                self.messages.append(new)
                self.sum_all_messages += entry.size
            for entry in mailbox.extrafiles:
                self.sum_extra += entry.size

    @staticmethod
    def _percent(part, whole):
        """Return ``part`` as a percentage of ``whole``; 0.0 if ``whole`` is 0."""
        if not whole:
            return 0.0
        return part / whole * 100

    def dump_summary(self):
        """Print message listings, overall storage use and login statistics."""
        # (title, total size) of every listing that goes into the overview
        reports = []

        def in_folder(entry, folder):
            # Match the folder as a whole path component so that a mailbox
            # name merely containing e.g. "cur" is not a false positive.
            # NOTE(review): assumes mailbox-relative message paths look like
            # "cur/<file>" -- confirm against MailboxStat.
            return f"/{folder}/" in entry.relpath

        def print_messages(title, messages, num, rep=True):
            print()
            allsize = sum(x.size for x in messages)
            if rep:
                reports.append((title, allsize))
            print(f"## {title} [total: {H(allsize)}]")
            for entry in messages[:num]:
                # ages are relative to the report's reference time
                print(f"{K(entry.size)} {D(entry.mtime, self.now)} {entry.relpath}")

        for kind in ("cur", "new"):
            biggest = [x for x in self.messages if in_folder(x, kind)]
            biggest.sort(key=lambda x: (-x.size, x.mtime))
            print_messages(f"Biggest {kind} messages", biggest, 10, rep=False)

        oldest = self.messages
        mode = "cur"
        # thresholds increase, so each pass only needs to re-filter the
        # survivors of the previous one
        for maxsize in (160000, 500000, 2000000, 10000000):
            oldest = [x for x in oldest if x.size > maxsize and in_folder(x, mode)]
            oldest.sort(key=lambda x: x.mtime)
            print_messages(f"{mode} folders oldest > {K(maxsize)} messages", oldest, 10)

        # list all 160K files of people who haven't logged in for a while
        # NOTE(review): the title says "new" but no folder filter is applied,
        # and the ascending sort lists the smallest matches first -- confirm
        # whether that is intended.
        messages = []
        cutoff_date_login = self.now - 30 * DAYSECONDS
        for mstat in self.stats.mailboxes:
            if mstat.last_login and mstat.last_login < cutoff_date_login:
                messages.extend(msg for msg in mstat.messages if msg.size > 160000)
        messages.sort(key=lambda x: x.size)
        print_messages(">30-day last_login new >160K", messages, 10)

        print()
        print("## Overall mailbox storage use analysis")
        total_size = self.sum_extra + self.sum_all_messages
        print(f"Mailbox data: {M(total_size)}")
        print(f"Messages : {M(self.sum_all_messages)}")
        percent = self._percent(self.sum_extra, total_size)
        print(f"Extra files : {M(self.sum_extra)} ({percent:.2f}%)")

        for title, size in reports:
            percent = self._percent(size, self.sum_all_messages)
            print(f"{title:38} {M(size)} ({percent:.2f}%)")

        all_logins = len(self.user_logins) + len(self.ci_logins)
        num_logins = len(self.user_logins)
        ci_logins = len(self.ci_logins)

        def p(num):
            return f"({self._percent(num, num_logins):2.2f}%)"

        print()
        print(f"## Login stats, from date reference {datetime.fromtimestamp(self.now)}")
        print(f"all: {K(all_logins)}")
        print(f"non-ci: {K(num_logins)}")
        print(f"ci: {K(ci_logins)}")
        for days in (1, 10, 30, 40, 80, 100, 150):
            cutoff = self.now - days * DAYSECONDS
            active = sum(1 for x in self.user_logins if x >= cutoff)
            print(f"last {days:3} days: {K(active)} {p(active)}")
def run_report(config, basedir, maxnum=None, now=None):
    """Scan the mailboxes below ``basedir`` and print the summary report.

    ``config`` is accepted for symmetry with the other entry points but is
    not used here.  ``maxnum`` is passed through to ``Stats``.  ``now`` is
    the reference time in seconds since the epoch and defaults to the
    current time, because ``Report`` does arithmetic on it and cannot work
    with ``None``.
    """
    if now is None:
        now = datetime.now().timestamp()
    stats = Stats(basedir, maxnum=maxnum)
    stats.iter_mailboxes()
    rep = Report(stats, now=now)
    rep.dump_summary()
def main():
    """Command line entry point: ``fsreport CFGPATH BASEDIR [MAXNUM]``.

    Uses the current time as the reference date for the report.
    """
    args = sys.argv[1:]
    if len(args) not in (2, 3):
        raise SystemExit("usage: fsreport CFGPATH BASEDIR [MAXNUM]")
    cfgpath, basedir = args[:2]
    # MAXNUM is optional; None is run_report's own default for it
    maxnum = int(args[2]) if len(args) == 3 else None
    config = read_config(cfgpath)
    # datetime.now().timestamp() is the real epoch time; the previous
    # hard-coded reference date pinned every report to one fixed day
    now = datetime.now().timestamp()
    run_report(config, basedir, maxnum=maxnum, now=now)


if __name__ == "__main__":
    main()

View File

@@ -1,4 +1,14 @@
from chatmaild.expire import MailboxStat import random
from chatmaild.expire import FileEntry, MailboxStat
def test_filentry_ordering():
    """Sorting shuffled entries by size restores the original order."""
    # built in ascending size order, so this list is the expected result
    expected = [FileEntry(f"x{i}", size=i + 10, mtime=1000 - i) for i in range(10)]
    entries = list(expected)
    random.shuffle(entries)
    entries.sort(key=lambda entry: entry.size)
    assert entries == expected
def test_stats_mailbox(tmp_path): def test_stats_mailbox(tmp_path):
@@ -22,18 +32,15 @@ def test_stats_mailbox(tmp_path):
assert mbox.last_login == password.stat().st_mtime assert mbox.last_login == password.stat().st_mtime
assert len(mbox.messages) == 2 assert len(mbox.messages) == 2
seen = mbox.get_messages("cur") msgs = list(mbox.messages)
assert len(seen) == 1 assert len(msgs) == 2
assert seen[0].size == 3 assert msgs[0].size == 3 # cur
new = mbox.get_messages("new") assert msgs[1].size == 6 # new
assert len(new) == 1
assert new[0].size == 6
extra = mailboxdir.joinpath("large") extra = mailboxdir.joinpath("large")
extra.write_text("x" * 1000) extra.write_text("x" * 1000)
mailboxdir.joinpath("index-something").write_text("123") mailboxdir.joinpath("index-something").write_text("123")
mbox = MailboxStat(tmp_path) mbox = MailboxStat(tmp_path)
extrafiles = mbox.get_extra_files() assert len(mbox.extrafiles) == 3
assert len(extrafiles) == 3 assert mbox.extrafiles[0].size == 1000
assert extrafiles[0].size == 1000