1
0

cleanmedia 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313
  1. #!/usr/bin/env python3
  2. # This program is free software: you can redistribute it and/or modify
  3. # it under the terms of the GNU General Public License as published by
  4. # the Free Software Foundation, either version 3 of the License, or
  5. # (at your option) any later version.
  6. #
  7. # This program is distributed in the hope that it will be useful,
  8. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. # GNU General Public License for more details.
  11. #
  12. # You should have received a copy of the GNU General Public License
  13. # along with this program. If not, see <https://www.gnu.org/licenses/>.
  14. import argparse
  15. import logging
  16. from datetime import datetime, timedelta
  17. from functools import cached_property
  18. from pathlib import Path
  19. from typing import Optional, Union, List, Tuple
  20. try:
  21. import psycopg2, psycopg2.extensions # noqa: E401
  22. import yaml
  23. except ImportError:
  24. raise Exception("Please install psycopg2 and pyyaml")
  25. # ------------------------------------------------------------------------
  26. class File:
  27. """A file in our db together with (hopefully) a physical file and thumbnails"""
  28. def __init__(self, media_repo: 'MediaRepository', media_id: str, creation_ts: int, base64hash: str):
  29. # The MediaRepository in which this file is recorded
  30. self.repo = media_repo
  31. self.media_id = media_id
  32. # creation_ts is seconds since the epoch
  33. self.create_date = datetime.fromtimestamp(creation_ts)
  34. self.base64hash = base64hash
  35. @cached_property
  36. def fullpath(self) -> Optional[Path]:
  37. """returns the directory in which the "file" and all thumbnails are located, or None if no file is known"""
  38. if not self.base64hash:
  39. return None
  40. return self.repo.media_path / self.base64hash[0:1] / self.base64hash[1:2] / self.base64hash[2:]
  41. def delete(self) -> bool:
  42. """Delete db entries, and the file itself
  43. :returns: True on successful delete of file,
  44. False or Exception on failure"""
  45. res = True
  46. if self.fullpath is None:
  47. logging.info(f"No known path for file id '{self.media_id}', cannot delete file.")
  48. res = False
  49. elif not self.fullpath.is_dir():
  50. logging.debug(f"Path for file id '{self.media_id}' is not a directory or does not exist, not deleting.")
  51. res = False
  52. else:
  53. for file in self.fullpath.glob('*'):
  54. # note: this does not handle directories in fullpath
  55. file.unlink()
  56. self.fullpath.rmdir()
  57. logging.debug(f"Deleted directory {self.fullpath}")
  58. with self.repo.conn.cursor() as cur:
  59. cur.execute("DELETE from mediaapi_thumbnail WHERE media_id=%s;", (self.media_id,))
  60. num_thumbnails = cur.rowcount
  61. cur.execute("DELETE from mediaapi_media_repository WHERE media_id=%s;", (self.media_id,))
  62. num_media = cur.rowcount
  63. self.repo.conn.commit()
  64. logging.debug(f"Deleted {num_media} + {num_thumbnails} db entries for media id {self.media_id}")
  65. return res
  66. def exists(self) -> bool:
  67. """returns True if the media file itself exists on the file system"""
  68. if self.fullpath is None:
  69. return False
  70. return (self.fullpath / 'file').exists()
  71. def has_thumbnail(self) -> int:
  72. """Returns the number of thumbnails associated with this file"""
  73. with self.repo.conn.cursor() as cur:
  74. cur.execute(f"select COUNT(media_id) from mediaapi_thumbnail WHERE media_id='{self.media_id}';")
  75. row = cur.fetchone()
  76. if row is None:
  77. return 0
  78. return int(row[0])
  79. class MediaRepository:
  80. """A dendrite media repository"""
  81. def __init__(self, media_path: Path, connection_string: str):
  82. self.media_path = media_path
  83. if not self.media_path.is_absolute():
  84. logging.warn("The media path is relative, make sure you run this script in the correct directory!")
  85. if not self.media_path.is_dir():
  86. raise Exception("The configured media dir cannot be found!")
  87. # List of current avatar imgs. init empty
  88. self._avatar_media_ids: List[str] = []
  89. self.db_conn_string = connection_string # psql db connection
  90. self.conn = self.connect_db()
  91. def connect_db(self) -> psycopg2.extensions.connection:
  92. # postgresql://user:pass@localhost/database?params
  93. if self.db_conn_string is None \
  94. or not self.db_conn_string.startswith(("postgres://",
  95. "postgresql://")):
  96. errstr = "DB connection not a postgres one"
  97. logging.error(errstr)
  98. raise ValueError(errstr)
  99. return psycopg2.connect(self.db_conn_string)
  100. def get_single_media(self, mxid: str) -> Optional[File]:
  101. """Return `File` or `None`"""
  102. with self.conn.cursor() as cur:
  103. sql_str = "SELECT media_id, creation_ts, base64hash from mediaapi_media_repository WHERE media_id = %s;"
  104. cur.execute(sql_str, (mxid,))
  105. row = cur.fetchone()
  106. if row is None:
  107. return None
  108. # creation_ts is ms since the epoch, so convert to seconds
  109. return File(self, row[0], row[1] // 1000, row[2])
  110. def get_local_user_media(self, user_id: str) -> List[File]:
  111. """Return all media created by a local user
  112. :params:
  113. :user_id: (`str`) of form "@user:servername.com"
  114. :returns: `List[File]`
  115. """
  116. with self.conn.cursor() as cur:
  117. sql_str = "SELECT media_id, creation_ts, base64hash from mediaapi_media_repository WHERE user_id = %s;"
  118. cur.execute(sql_str, (user_id,))
  119. files = []
  120. for row in cur.fetchall():
  121. # creation_ts is ms since the epoch, so convert to seconds
  122. f = File(self, row[0], row[1] // 1000, row[2])
  123. files.append(f)
  124. return files
  125. def get_all_media(self, local: bool = False) -> List[File]:
  126. """Return List[File] of remote media or ALL media if local==True"""
  127. with self.conn.cursor() as cur:
  128. # media_id | media_origin | content_type | file_size_bytes | creation_ts | upload_name | base64hash | user_id
  129. sql_str = "SELECT media_id, creation_ts, base64hash from mediaapi_media_repository"
  130. if not local:
  131. # only fetch remote media where user_id is empty
  132. sql_str += " WHERE user_id = ''"
  133. sql_str += ";"
  134. cur.execute(sql_str)
  135. files = []
  136. for row in cur.fetchall():
  137. # creation_ts is ms since the epoch, so convert to seconds
  138. f = File(self, row[0], row[1] // 1000, row[2])
  139. files.append(f)
  140. return files
  141. def get_avatar_images(self) -> List[str]:
  142. """Get a list of media_id which are current avatar images
  143. We don't want to clean up those. Save & cache them internally.
  144. """
  145. media_id = []
  146. with self.conn.cursor() as cur:
  147. cur.execute("SELECT avatar_url FROM userapi_profiles WHERE avatar_url > '';")
  148. for row in cur.fetchall():
  149. url = row[0] # mxc://matrix.org/6e627f4c538563
  150. try:
  151. media_id.append(url[url.rindex("/") + 1:])
  152. except ValueError:
  153. logging.warn("No slash in URL '%s'!", url)
  154. self._avatar_media_ids = media_id
  155. return self._avatar_media_ids
  156. def sanity_check_thumbnails(self) -> None:
  157. """Warn if we have thumbnails in the db that do not refer to existing media"""
  158. with self.conn.cursor() as cur:
  159. cur.execute("SELECT COUNT(media_id) from mediaapi_thumbnail WHERE NOT EXISTS (SELECT media_id FROM mediaapi_media_repository);")
  160. row = cur.fetchone()
  161. if row is not None and row[0]:
  162. logging.error("You have {} thumbnails in your db that do not refer to media. This needs fixing (we don't do that)!".format(row[0]))
  163. def clean_media_files(self, days: int, local: bool = False, dryrun: bool = False) -> int:
  164. """Clean out old media files from this repository
  165. :params:
  166. :days: (int) delete media files older than N days.
  167. :local: (bool) Also delete media originating from local users
  168. :dryrun: (bool) Do not actually delete any files (just count)
  169. :returns: (int) The number of files that were/would be deleted
  170. """
  171. if local:
  172. # populate the cache of current avt img. so we don't delete them
  173. mr.get_avatar_images()
  174. cleantime = datetime.today() - timedelta(days=days)
  175. logging.info("Deleting remote media older than %s", cleantime)
  176. num_deleted = 0
  177. files = mr.get_all_media(local)
  178. for file in [f for f in files if f.media_id not in mr._avatar_media_ids]:
  179. if file.create_date < cleantime:
  180. num_deleted += 1
  181. if dryrun: # the great pretender
  182. logging.info(f"Pretending to delete file id {file.media_id} on path {file.fullpath}.")
  183. if not file.exists():
  184. logging.info(f"File id {file.media_id} does not physically exist (path {file.fullpath}).")
  185. else:
  186. file.delete()
  187. info_str = "Deleted %d files during the run."
  188. if dryrun:
  189. info_str = "%d files would have been deleted during the run."
  190. logging.info(info_str, num_deleted)
  191. return num_deleted
  192. # --------------------------------------------------------------
  193. def read_config(conf_file: Union[str, Path]) -> Tuple[Path, str]:
  194. """Read in the dendrite config file and return db creds and media path"""
  195. try:
  196. with open(conf_file) as f:
  197. config = yaml.safe_load(f)
  198. except FileNotFoundError:
  199. errstr = f"Config file {conf_file} not found. Use the --help option to find out more."
  200. logging.error(errstr)
  201. exit(1)
  202. if "media_api" not in config:
  203. logging.error("Missing section media_api")
  204. exit(1)
  205. CONN_STR = None
  206. if "global" in config and "database" in config["global"]:
  207. CONN_STR = config["global"]["database"].get("connection_string", None)
  208. elif "database" in config["media_api"]:
  209. logging.debug("No database section in global, but one in media_api, using that")
  210. CONN_STR = config["media_api"]["database"].get("connection_string", None)
  211. if CONN_STR is None:
  212. logging.error("Did not find connection string to media database.")
  213. exit(1)
  214. BASE_PATH = Path(config["media_api"].get("base_path", None))
  215. if BASE_PATH is None:
  216. logging.error("Missing base_path in media_api")
  217. exit(1)
  218. return (BASE_PATH, CONN_STR)
  219. def parse_options() -> argparse.Namespace:
  220. loglevel = logging.INFO # default logging level
  221. parser = argparse.ArgumentParser(
  222. prog='cleanmedia',
  223. description='Deletes 30 day old remote media files from dendrite servers')
  224. parser.add_argument('-c', '--config', default="config.yaml", help="location of the dendrite.yaml config file.")
  225. parser.add_argument('-m', '--mxid', dest="mxid",
  226. help="Just delete media <MXID>. (no cleanup otherwise)")
  227. parser.add_argument('-u', '--userid', dest="userid",
  228. help="Delete all media by local user '\\@user:domain.com'. (ie, a user on hour homeserver. no cleanup otherwise)")
  229. parser.add_argument('-t', '--days', dest="days",
  230. default="30", type=int,
  231. help="Keep remote media for <DAYS> days.")
  232. parser.add_argument('-l', '--local', action='store_true',
  233. help="Also purge local (ie, from *our* users) media.")
  234. parser.add_argument('-n', '--dryrun', action='store_true',
  235. help="Dry run (don't actually modify any files).")
  236. parser.add_argument('-q', '--quiet', action='store_true', help="Reduce output verbosity.")
  237. parser.add_argument('-d', '--debug', action='store_true', help="Increase output verbosity.")
  238. args: argparse.Namespace = parser.parse_args()
  239. if args.debug:
  240. loglevel = logging.DEBUG
  241. elif args.quiet:
  242. loglevel = logging.WARNING
  243. logging.basicConfig(level=loglevel, format='%(levelname)s - %(message)s')
  244. return args
  245. if __name__ == '__main__':
  246. args = parse_options()
  247. (MEDIA_PATH, CONN_STR) = read_config(args.config)
  248. mr = MediaRepository(MEDIA_PATH, CONN_STR)
  249. if args.mxid:
  250. # Just clean a single media
  251. logging.info("Attempting to delete media '%s'", args.mxid)
  252. file = mr.get_single_media(args.mxid)
  253. if file:
  254. logging.info("Found media with id '%s'", args.mxid)
  255. if not args.dryrun:
  256. file.delete()
  257. elif args.userid:
  258. logging.info("Attempting to delete media by user '%s'", args.userid)
  259. files = mr.get_local_user_media(args.userid)
  260. num_deleted = 0
  261. for file in files:
  262. num_deleted += 1
  263. if args.dryrun: # the great pretender
  264. logging.info(f"Pretending to delete file id {file.media_id} on path {file.fullpath}.")
  265. else:
  266. file.delete()
  267. info_str = "Deleted %d files during the run."
  268. if args.dryrun:
  269. info_str = "%d files would have been deleted during the run."
  270. logging.info(info_str, num_deleted)
  271. else: # main clean out...
  272. # Sanity checks
  273. mr.sanity_check_thumbnails() # warn in case of superfluous thumbnails
  274. mr.clean_media_files(args.days, args.local, args.dryrun)