check_bogus.py 1.2 KB

123456789101112131415161718192021222324252627282930313233343536
  1. #!/usr/bin/env python3
  2. """Check all curated TLDs in violator.conf against Lists.toml all section."""
  3. import re
  4. # Parse all TLDs from Lists.toml
  5. with open('scripts/violator-workdir/Lists.toml') as f:
  6. toml = f.read()
  7. m = re.search(r'^all = \[(.*?)\]', toml, re.DOTALL | re.MULTILINE)
  8. all_tlds = set(re.findall(r'"([a-z][a-z0-9]*)', m.group(1)))
  9. # Parse violator.conf
  10. with open('scripts/violator.conf') as f:
  11. conf = f.read()
  12. # Find all curated list sections and their tlds
  13. sections = re.findall(r'\[list\.(\w+)\].*?type\s*=\s*curated.*?tlds\s*=\s*(.*?)(?=\n\[|\Z)', conf, re.DOTALL)
  14. bogus = {}
  15. for name, tlds_block in sections:
  16. words = re.findall(r'[a-z][a-z0-9]*', tlds_block)
  17. # skip config keywords
  18. keywords = {'type', 'curated', 'description', 'tlds', 'min', 'max', 'filter', 'all'}
  19. for w in words:
  20. if w not in all_tlds and w not in keywords:
  21. bogus.setdefault(name, []).append(w)
  22. if not bogus:
  23. print("All curated TLDs exist in the all list!")
  24. else:
  25. total = 0
  26. for name, bads in bogus.items():
  27. print(f"\n[list.{name}] — {len(bads)} fake TLDs:")
  28. print(" " + " ".join(bads))
  29. total += len(bads)
  30. print(f"\nTotal: {total} TLDs to remove")