diff --git a/chatmaild/pyproject.toml b/chatmaild/pyproject.toml index 8097c8df..d3056da8 100644 --- a/chatmaild/pyproject.toml +++ b/chatmaild/pyproject.toml @@ -28,6 +28,7 @@ filtermail = "chatmaild.filtermail:main" echobot = "chatmaild.echo:main" chatmail-metrics = "chatmaild.metrics:main" expire = "chatmaild.expire:main" +fsreport = "chatmaild.fsreport:main" lastlogin = "chatmaild.lastlogin:main" turnserver = "chatmaild.turnserver:main" diff --git a/chatmaild/src/chatmaild/expire.py b/chatmaild/src/chatmaild/expire.py index 01b375f0..341cfbae 100644 --- a/chatmaild/src/chatmaild/expire.py +++ b/chatmaild/src/chatmaild/expire.py @@ -1,42 +1,47 @@ +""" +Expire old messages and addresses. + +""" + import os import shutil import sys -import time -from collections import namedtuple from datetime import datetime from stat import S_ISREG from chatmaild.config import read_config -# delete already seen big mails after 7 days, in the INBOX -# 2 0 * * * vmail find {{ config.mailboxes_dir }} -path '*/cur/*' -mtime +{{ config.delete_large_after }} -size +200k -type f -delete -# # delete all mails after {{ config.delete_mails_after }} days, in the Inbox -# 3 0 * * * vmail find {{ config.mailboxes_dir }} -name 'maildirsize' -type f -delete +# XXX maildirsize (used by dovecot quota) needs to be removed after removing files -FileEntry = namedtuple("FileEntry", ["relpath", "mtime", "size"]) -dayseconds = 24 * 60 * 60 -monthseconds = dayseconds * 30 +class FileEntry: + def __init__(self, relpath, mtime, size): + self.relpath = relpath + self.mtime = mtime + self.size = size + + def __repr__(self): + return f"" + + def fmt_size(self): + return f"{int(self.size/1000):5.0f}K" + + def fmt_since(self, now): + diff_seconds = int(now) - int(self.mtime) + return f"{int(diff_seconds / 86400):2.0f}d" + + def __eq__(self, other): + return ( + self.relpath == other.relpath + and self.size == other.size + and self.mtime == other.mtime + ) def joinpath(name, extra): return name + "/" + extra -def D(timestamp, now=datetime.utcnow().timestamp()): - diff_seconds = int(now) - int(timestamp) - # assert diff_seconds >= 0, (int(timestamp), int(now)) - return f"{int(diff_seconds / dayseconds):2.0f}d" - - -def K(size): - return f"{int(size/1000):6.0f}K" - - -def M(size): - return f"{int(size/1000000):6.0f}M" - - class Stats: def __init__(self, basedir, maxnum=None): self.basedir = str(basedir) @@ -53,9 +58,16 @@ class Stats: class MailboxStat: def __init__(self, mailboxdir): self.mailboxdir = mailboxdir = str(mailboxdir) + # all detected messages in cur/new/tmp folders self.messages = [] + + # all detected files in mailbox top dir self.extrafiles = [] + # total size of all detected files + self.totalsize = 0 + + # scan all relevant files (without recursion) for name in os.listdir(mailboxdir): fpath = joinpath(mailboxdir, name) if name in ("cur", "new", "tmp"): @@ -66,11 +78,13 @@ class MailboxStat: self.messages.append( FileEntry(relpath, mtime=st.st_mtime, size=st.st_size) ) + self.totalsize += st.st_size else: st = os.stat(fpath) if S_ISREG(st.st_mode): self.extrafiles.append(FileEntry(name, st.st_mtime, st.st_size)) - self.extrafiles.sort(key=lambda x: x.size, reverse=True) + self.totalsize += st.st_size + self.extrafiles.sort(key=lambda x: -x.size) @property def last_login(self): @@ -78,101 +92,8 @@ class MailboxStat: if entry.relpath == "password": return entry.mtime - def get_messages(self, prefix=""): - l = [] - for entry in self.messages: - if entry.relpath.startswith(prefix): - l.append(entry) - return l - - def get_extra_files(self): - return list(self.extrafiles) - - def get_file_entry(self, name): - for entry in self.extrafiles: - if name == entry.relapth: - return entry - - -class XXXStats: - def __init__(self): - self.sum_extra = 0 - self.sum_all_messages = 0 - self.logins = [] - self.messages = [] - - def analyze(self, statscache): - print("start") - for mailbox in statscache.cache: - mbox_cache = statscache.cache[mailbox] - if "password" not in mbox_cache: - continue - self.logins.append(mbox_cache["password"][0]) - for relpath, (mtime, size) in mbox_cache.items(): - if relpath[:4] in ("cur/", "new/", "tmp/"): - self.sum_all_messages += size - entry = FileEntry(relpath=relpath, mtime=mtime, size=size) - self.messages.append(entry) - else: - self.sum_extra += size - - def dump_summary(self): - now = datetime.utcnow().timestamp() - - print(f"size of everything: {M(self.sum_extra + self.sum_all_messages)}") - print(f"size all messages: {M(self.sum_all_messages)}") - percent = self.sum_extra / (self.sum_extra + self.sum_all_messages) * 100 - print(f"size extra files: {M(self.sum_extra)} ({percent:.2f}%)") - for size in (100000, 200000, 500000, 1000000, 5000000): - all_of_size = sum( - x.size - for x in self.messages - if x.size > size and x.relpath.startswith("cur") - ) - percent = all_of_size / self.sum_all_messages * 100 - print(f"size seen {K(size)} messages: {M(all_of_size)} ({percent:.2f}%)") - for size in (100000, 200000, 500000, 1000000, 5000000): - all_of_size = sum( - x.size - for x in self.messages - if x.size > size and x.mtime < now - 2 * dayseconds - ) - percent = all_of_size / self.sum_all_messages * 100 - print( - f"size 2day-old {K(size)} messages: {M(all_of_size)} ({percent:.2f}%)" - ) - for size in (100000, 200000, 500000, 1000000, 5000000): - all_of_size = sum( - x.size - for x in self.messages - if x.size > size - and x.relpath.startswith("cur") - and x.mtime < now - 7 * dayseconds - ) - percent = all_of_size / self.sum_all_messages * 100 - print( - f"size seen 7-day old {K(size)} messages: {M(all_of_size)} ({percent:.2f}%)" - ) - - print() - - num_logins = len(self.logins) - monthly_active = len([x for x in self.logins if x >= now - monthseconds]) - daily_active = len([x for x in self.logins if x >= now - dayseconds]) - stale = num_logins - monthly_active - - def p(num): - return f"({num/num_logins * 100:.2f}%)" - - print(f"all logins: {K(num_logins)}") - print(f"monthly active: {K(monthly_active)} {p(monthly_active)}") - print(f">1m old logins: {K(stale)} {p(stale)}") - print(f"daily active: {K(daily_active)} {p(daily_active)}") - - -def run_expire(config, basedir, dry=False, maxnum=None): - now = time.time() +def run_expire(config, basedir, now, dry=True, maxnum=None): stat = Stats(basedir, maxnum=maxnum) stat.iter_mailboxes() cutoff_date_without_login = now - int(config.delete_inactive_users_after) * 86400 @@ -188,7 +109,9 @@ def run_expire(config, basedir, dry=False, maxnum=None): def unlink(mailboxdir, message): if dry: relpath = os.path.basename(mailboxdir) + message.relpath - print(f"would remove {D(message.mtime)} {K(message.size)} {relpath}") + print( + f"would remove {message.fmt_since(now)} {message.fmt_size()} {relpath}" + ) else: os.unlink(path) @@ -217,7 +140,9 @@ def run_expire(config, basedir, dry=False, maxnum=None): def main(): cfgpath, basedir, maxnum = sys.argv[1:] config = read_config(cfgpath) - run_expire(config, basedir, dry=True, maxnum=int(maxnum)) + now = datetime.utcnow().timestamp() + now = datetime(2025, 9, 9).timestamp() + run_expire(config, basedir, maxnum=int(maxnum), now=now) if __name__ == "__main__": diff --git a/chatmaild/src/chatmaild/fsreport.py b/chatmaild/src/chatmaild/fsreport.py new file mode 100644 index 00000000..91a59302 --- /dev/null +++ b/chatmaild/src/chatmaild/fsreport.py @@ -0,0 +1,150 @@ +import os +import sys +from datetime import datetime + +from chatmaild.config import read_config +from chatmaild.expire import FileEntry, Stats, joinpath + +DAYSECONDS = 24 * 60 * 60 +MONTHSECONDS = DAYSECONDS * 30 + + +def D(timestamp, now=datetime.utcnow().timestamp()): + diff_seconds = int(now) - int(timestamp) + # assert diff_seconds >= 0, (int(timestamp), int(now)) + return f"{int(diff_seconds / DAYSECONDS):2.0f}d" + + +def K(size): + if size < 1000: + return f"{size:5.0f}" + return f"{int(size/1000):5.0f}K" + + +def M(size): + return f"{int(size/1000000):5.0f}M" + + +def H(size): + if size < 1000 * 1000: + return K(size) + if size < 1000 * 1000 * 1000: + return M(size) + return f"{size/1000000000:2.2f}G" + + +class Report: + def __init__(self, stats, now): + self.sum_extra = 0 + self.sum_all_messages = 0 + self.messages = [] + self.user_logins = [] + self.ci_logins = [] + self.stats = stats + self.now = now + + for mailbox in stats.mailboxes: + last_login = mailbox.last_login + if last_login: + if os.path.basename(mailbox.mailboxdir)[:3] == "ci-": + self.ci_logins.append(last_login) + else: + self.user_logins.append(last_login) + for entry in mailbox.messages: + new = FileEntry( + relpath=joinpath( + os.path.basename(mailbox.mailboxdir), entry.relpath + ), + mtime=entry.mtime, + size=entry.size, + ) + self.messages.append(new) + self.sum_all_messages += entry.size + + for entry in mailbox.extrafiles: + self.sum_extra += entry.size + + def dump_summary(self): + reports = [] + + def print_messages(title, messages, num, rep=True): + print() + allsize = sum(x.size for x in messages) + if rep: + reports.append((title, allsize)) + + print(f"## {title} [total: {H(allsize)}]") + for entry in messages[:num]: + print(f"{K(entry.size)} {D(entry.mtime)} {entry.relpath}") + + for kind in ("cur", "new"): + biggest = list(self.messages) + biggest.sort(key=lambda x: (-x.size, x.mtime)) + print_messages(f"Biggest {kind} messages", biggest, 10, rep=False) + + oldest = self.messages + mode = "cur" + for maxsize in (160000, 500000, 2000000, 10000000): + oldest = [x for x in oldest if x.size > maxsize and mode in x.relpath] + oldest.sort(key=lambda x: x.mtime) + print_messages(f"{mode} folders oldest > {K(maxsize)} messages", oldest, 10) + + # list all 160K files of people who haven't logged in for a while + messages = [] + cutoff_date_login = self.now - 30 * DAYSECONDS + for mstat in self.stats.mailboxes: + if mstat.last_login and mstat.last_login < cutoff_date_login: + for msg in mstat.messages: + if msg.size > 160000: + messages.append(msg) + + messages.sort(key=lambda x: x.size) + print_messages(">30-day last_login new >160K", messages, 10) + + print() + print("## Overall mailbox storage use analysis") + print(f"Mailbox data: {M(self.sum_extra + self.sum_all_messages)}") + print(f"Messages : {M(self.sum_all_messages)}") + percent = self.sum_extra / (self.sum_extra + self.sum_all_messages) * 100 + print(f"Extra files : {M(self.sum_extra)} ({percent:.2f}%)") + + for title, size in reports: + percent = size / self.sum_all_messages * 100 + print(f"{title:38} {M(size)} ({percent:.2f}%)") + + all_logins = len(self.user_logins) + len(self.ci_logins) + num_logins = len(self.user_logins) + ci_logins = len(self.ci_logins) + + def p(num): + return f"({num/num_logins * 100:2.2f}%)" + + print() + print(f"## Login stats, from date reference {datetime.fromtimestamp(self.now)}") + print(f"all: {K(all_logins)}") + print(f"non-ci: {K(num_logins)}") + print(f"ci: {K(ci_logins)}") + for days in (1, 10, 30, 40, 80, 100, 150): + active = len( + [x for x in self.user_logins if x >= self.now - days * DAYSECONDS] + ) + print(f"last {days:3} days: {K(active)} {p(active)}") + + +def run_report(config, basedir, maxnum=None, now=None): + stats = Stats(basedir, maxnum=maxnum) + stats.iter_mailboxes() + rep = Report(stats, now=now) + rep.dump_summary() + + +def main(): + cfgpath, basedir, maxnum = sys.argv[1:] + config = read_config(cfgpath) + now = datetime.utcnow().timestamp() + now = datetime(2025, 9, 9).timestamp() + run_report(config, basedir, maxnum=int(maxnum), now=now) + + +if __name__ == "__main__": + main() diff --git a/chatmaild/src/chatmaild/tests/test_expire.py b/chatmaild/src/chatmaild/tests/test_expire.py index a2a40a57..3b290a79 100644 --- a/chatmaild/src/chatmaild/tests/test_expire.py +++ b/chatmaild/src/chatmaild/tests/test_expire.py @@ -1,4 +1,14 @@ -from chatmaild.expire import MailboxStat +import random + +from chatmaild.expire import FileEntry, MailboxStat + + +def test_filentry_ordering(): + l = [FileEntry(f"x{i}", size=i + 10, mtime=1000 - i) for i in range(10)] + sorted = list(l) + random.shuffle(l) + l.sort(key=lambda x: x.size) + assert l == sorted def test_stats_mailbox(tmp_path): @@ -22,18 +32,15 @@ def test_stats_mailbox(tmp_path): assert mbox.last_login == password.stat().st_mtime assert len(mbox.messages) == 2 - seen = mbox.get_messages("cur") - assert len(seen) == 1 - assert seen[0].size == 3 + msgs = list(mbox.messages) + assert len(msgs) == 2 + assert msgs[0].size == 3 # cur - new = mbox.get_messages("new") - assert len(new) == 1 - assert new[0].size == 6 + assert msgs[1].size == 6 # new extra = mailboxdir.joinpath("large") extra.write_text("x" * 1000) mailboxdir.joinpath("index-something").write_text("123") mbox = MailboxStat(tmp_path) - extrafiles = mbox.get_extra_files() - assert len(extrafiles) == 3 - assert extrafiles[0].size == 1000 + assert len(mbox.extrafiles) == 3 + assert mbox.extrafiles[0].size == 1000