diff --git a/fs_attachment/__manifest__.py b/fs_attachment/__manifest__.py index bb8fe1ca10..61012c7d80 100644 --- a/fs_attachment/__manifest__.py +++ b/fs_attachment/__manifest__.py @@ -5,7 +5,7 @@ { "name": "Base Attachment Object Store", "summary": "Store attachments on external object store", - "version": "16.0.2.0.1", + "version": "16.0.2.0.3", "author": "Camptocamp, ACSONE SA/NV, Odoo Community Association (OCA)", "license": "AGPL-3", "development_status": "Beta", diff --git a/fs_attachment/models/fs_file_gc.py b/fs_attachment/models/fs_file_gc.py index 6ab70ec38e..b44d6971f5 100644 --- a/fs_attachment/models/fs_file_gc.py +++ b/fs_attachment/models/fs_file_gc.py @@ -1,5 +1,6 @@ # Copyright 2023 ACSONE SA/NV # License AGPL-3.0 or later (https://www.gnu.org/licenses/agpl). +import gc import logging import threading from contextlib import closing, contextmanager @@ -118,6 +119,15 @@ def _gc_files(self) -> None: # commit to release the lock cr.commit() # pylint: disable=invalid-commit + # Max orphan files processed per batch. Bounds the per-run memory + # footprint of the autovacuum job when many orphans are queued: + # fsspec backend clients (adlfs, s3fs, ...) keep response buffers + # and connection-pool state alive until their referents are + # collected. Loading tens of thousands of fnames into a single + # Python list and iterating ``fs.rm`` in one pass was enough to + # hit Odoo's ``limit_memory_hard`` on production workers. + _GC_BATCH_SIZE = 500 + def _gc_files_unsafe(self) -> None: # get the list of fs.storage codes that must be autovacuumed codes = ( @@ -125,44 +135,50 @@ def _gc_files_unsafe(self) -> None: ) if not codes: return - # we process by batch of storage codes. - self._cr.execute( - """ - SELECT - fs_storage_code, - array_agg(store_fname) - - FROM - fs_file_gc - WHERE - fs_storage_code IN %s - AND NOT EXISTS ( - SELECT 1 - FROM ir_attachment - WHERE store_fname = fs_file_gc.store_fname - ) - GROUP BY - fs_storage_code - """, - (tuple(codes),), - ) - for code, store_fnames in self._cr.fetchall(): + # Process one storage at a time and paginate both the SELECT and + # the fs.rm loop so neither the Python list of file names nor + # the storage SDK's response buffers can grow unbounded. + for code in codes: self.env["fs.storage"].get_by_code(code) fs = self.env["fs.storage"].get_fs_by_code(code) - for store_fname in store_fnames: - try: - file_path = store_fname.partition("://")[2] - fs.rm(file_path) - except Exception: - _logger.debug("Failed to remove file %s", store_fname) - - # delete the records from the table fs_file_gc - self._cr.execute( - """ - DELETE FROM - fs_file_gc - WHERE - fs_storage_code IN %s - """, - (tuple(codes),), - ) + while True: + self._cr.execute( + """ + SELECT store_fname + FROM fs_file_gc + WHERE fs_storage_code = %s + AND NOT EXISTS ( + SELECT 1 + FROM ir_attachment + WHERE store_fname = fs_file_gc.store_fname + ) + LIMIT %s + """, + (code, self._GC_BATCH_SIZE), + ) + rows = self._cr.fetchall() + if not rows: + break + fnames = [row[0] for row in rows] + for store_fname in fnames: + try: + file_path = store_fname.partition("://")[2] + fs.rm(file_path) + except Exception: + _logger.debug("Failed to remove file %s", store_fname) + # Always clear this batch from fs_file_gc — fs.rm failures + # leak the blob in the backend (same behaviour as the + # pre-batching implementation) but the DB row must go or + # the next SELECT would re-fetch the same rows and the + # loop would never terminate. + self._cr.execute( + "DELETE FROM fs_file_gc WHERE store_fname = ANY(%s)", + (fnames,), + ) + # Force a collection between batches to reclaim response + # buffers and connection objects held by the storage SDK + # that would otherwise only be freed on worker exit. Do + # NOT commit here: the caller (_gc_files) holds a SHARE + # lock on fs_file_gc and ir_attachment for consistency + # and commits at the end. + gc.collect() diff --git a/fs_attachment/readme/newsfragments/gc_batching.bugfix b/fs_attachment/readme/newsfragments/gc_batching.bugfix new file mode 100644 index 0000000000..d94d2bc95c --- /dev/null +++ b/fs_attachment/readme/newsfragments/gc_batching.bugfix @@ -0,0 +1,16 @@ +Paginate the autovacuum GC loop to bound worker memory. + +``FsFileGC._gc_files_unsafe`` used to load the entire backlog of orphan +files into a single Python list via ``array_agg(store_fname)`` and iterate +``fs.rm`` over all of them in one pass. With the Azure Blob backend and +tens of thousands of orphans queued, each HEAD+DELETE pair retained +response buffers and connection-pool state inside the adlfs client that +was only released when the worker exited. The autovacuum cron hit Odoo's +``limit_memory_hard`` and got ``SIGKILL``'d mid-run every time, so the +queue never drained and the next worker ran the same failing loop. + +The SELECT and the ``fs.rm`` loop are now paginated in batches of 500 per +storage, with an explicit ``gc.collect()`` between batches. The caller +(``_gc_files``) still holds the ``SHARE`` lock and performs the final +commit, so the consistency guarantees and transactional semantics are +unchanged.