Coverage for apps/recipes/services/image_cache.py: 76%
143 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-12 10:49 +0000
1"""
2Search result image caching service for iOS 9 compatibility.
4Implements fire-and-forget batch downloads to cache external recipe images
5locally, avoiding CORS and security issues on older Safari browsers.
6"""
8import asyncio
9import hashlib
10import io
11import logging
13from asgiref.sync import sync_to_async
14from curl_cffi.requests import AsyncSession
15from django.core.files.base import ContentFile
16from PIL import Image
18from apps.core.validators import (
19 MAX_IMAGE_SIZE,
20 MAX_REDIRECT_HOPS,
21 check_response_size,
22 validate_redirect_url,
23 validate_url,
24)
25from apps.recipes.services.fingerprint import BROWSER_PROFILES
# Limit decompression bomb attacks via PIL.
# Pillow warns when a decoded image exceeds MAX_IMAGE_PIXELS and raises
# DecompressionBombError beyond twice that value. NOTE(review): 178,956,970
# is 2x Pillow's default threshold — confirm this relaxed cap is intended.
Image.MAX_IMAGE_PIXELS = 178_956_970

# Module-level logger named after this module (standard logging convention).
logger = logging.getLogger(__name__)
class SearchImageCache:
    """
    Service for caching search result images to local storage.

    Enables iOS 9 Safari compatibility by downloading external recipe images
    to the server immediately (fire-and-forget), then returning local URLs
    that don't trigger CORS restrictions.

    Browser profiles are centralized in fingerprint.py for maintainability.
    """

    # Maximum simultaneous downloads per batch (semaphore-bounded).
    MAX_CONCURRENT = 5
    # Per-request timeout in seconds for each download hop.
    DOWNLOAD_TIMEOUT = 15

    async def cache_images(self, image_urls: list) -> None:
        """
        Fire-and-forget batch download of search result images.

        Args:
            image_urls: List of external image URLs to cache

        Returns:
            None (errors logged but not raised)
        """
        if not image_urls:
            return

        # Bound concurrency so a large search result doesn't open an
        # unbounded number of simultaneous connections.
        semaphore = asyncio.Semaphore(self.MAX_CONCURRENT)

        tasks = [self._download_and_save(None, semaphore, url) for url in image_urls]

        # Await all downloads; return_exceptions=True keeps one failing task
        # from cancelling its siblings (failures are logged inside each task).
        await asyncio.gather(*tasks, return_exceptions=True)

    async def _download_and_save(self, session: AsyncSession, semaphore: asyncio.Semaphore, url: str) -> None:
        """
        Download and cache a single image with status tracking.

        Args:
            session: Currently unused; kept for signature compatibility.
                Sessions are created per redirect hop in _fetch_image_safe.
            semaphore: Semaphore to limit concurrent downloads
            url: External image URL to cache
        """
        # Import here to avoid circular imports
        from apps.recipes.models import CachedSearchImage

        async with semaphore:
            try:
                # Get or create cache record; status starts as PENDING.
                cached, _ = await sync_to_async(CachedSearchImage.objects.get_or_create)(
                    external_url=url, defaults={"status": CachedSearchImage.STATUS_PENDING}
                )

                # Skip if already successfully cached
                if cached.status == CachedSearchImage.STATUS_SUCCESS and cached.image:
                    return

                # Download image
                image_data = await self._fetch_image(url)
                if not image_data:
                    cached.status = CachedSearchImage.STATUS_FAILED
                    await sync_to_async(cached.save)(update_fields=["status"])
                    return

                # Convert to JPEG for iOS 9 compatibility (no WebP support)
                converted_data = self._convert_to_jpeg(image_data)
                if not converted_data:
                    cached.status = CachedSearchImage.STATUS_FAILED
                    await sync_to_async(cached.save)(update_fields=["status"])
                    return

                # Generate filename and save
                filename = self._generate_filename(url)
                cached.image = ContentFile(converted_data, name=filename)
                cached.status = CachedSearchImage.STATUS_SUCCESS
                await sync_to_async(cached.save)(update_fields=["image", "status"])
                logger.info("Cached 1 search image")
                logger.debug("Cached image from %s", url)

            except Exception as e:
                logger.error("Failed to cache search image: %s", e)
                logger.debug("Failed image URL: %s", url)
                # Best-effort: mark the record failed so the URL can be
                # retried later. (CachedSearchImage imported above is still
                # in scope here — no re-import needed.)
                try:
                    cached = await sync_to_async(CachedSearchImage.objects.get)(external_url=url)
                    cached.status = CachedSearchImage.STATUS_FAILED
                    await sync_to_async(cached.save)(update_fields=["status"])
                except Exception:
                    logger.warning("Failed to mark cached image as failed for %s", url, exc_info=True)

    async def _fetch_image(self, url: str) -> bytes | None:
        """
        Fetch image content from URL with browser profile fallback.

        Tries multiple browser profiles if initial request fails.
        Browser profiles are configured in fingerprint.py.

        Args:
            url: Image URL to fetch

        Returns:
            Image bytes or None if fetch fails
        """
        # Validate URL for SSRF protection (returns pinned DNS resolution)
        try:
            resolved = validate_url(url)
        except ValueError:
            logger.warning("Blocked image URL (SSRF): %s", url)
            return None

        # Try each browser profile with manual redirect following
        for profile in BROWSER_PROFILES:
            try:
                content = await self._fetch_image_safe(url, profile, resolved.curl_resolve)
                if content is not None:
                    return content
            except Exception as e:
                logger.debug("Failed to fetch image %s with %s: %s", url, profile, e)
                continue

        return None

    async def _fetch_image_safe(self, url, profile, curl_resolve=None):
        """Fetch image following redirects with per-hop SSRF validation and DNS pinning."""
        from urllib.parse import urljoin

        from curl_cffi import CurlOpt

        current_url = url
        current_resolve = curl_resolve or []
        for _ in range(MAX_REDIRECT_HOPS):
            # Pin DNS resolution for this hop (prevents rebinding between
            # validation and fetch).
            curl_opts = {CurlOpt.RESOLVE: current_resolve} if current_resolve else {}
            async with AsyncSession(impersonate=profile, curl_options=curl_opts) as session:
                response = await session.get(
                    current_url,
                    timeout=self.DOWNLOAD_TIMEOUT,
                    allow_redirects=False,
                )

                if response.status_code in (301, 302, 303, 307, 308):
                    location = response.headers.get("location")
                    if not location:
                        return None
                    # RFC 7231: Location may be relative — resolve it against
                    # the current URL before SSRF-validating the next hop.
                    next_url = urljoin(current_url, location)
                    try:
                        resolved = validate_redirect_url(next_url)
                    except ValueError:
                        return None
                    current_url = next_url
                    current_resolve = resolved.curl_resolve
                    continue

                # Definitive "not found / gone" — no point retrying profiles.
                if response.status_code in (404, 410):
                    return None

                if response.status_code == 200 and response.content:
                    # Check the header-declared size first, then the actual body.
                    if not check_response_size(response, MAX_IMAGE_SIZE):
                        logger.warning("Image too large: %s", current_url)
                        return None
                    if len(response.content) > MAX_IMAGE_SIZE:
                        return None
                    content_type = response.headers.get("content-type", "")
                    if "image" in content_type:
                        return response.content
                    # Some servers mislabel images; fall back to magic-byte sniffing.
                    if self._looks_like_image(response.content):
                        return response.content

                return None

        logger.warning("Too many redirects for image: %s", url)
        return None

    @staticmethod
    def _looks_like_image(data: bytes) -> bool:
        """Check if bytes look like an image by inspecting magic bytes."""
        if len(data) < 4:
            return False
        # JPEG, PNG, GIF, WebP magic bytes
        return data[:2] == b"\xff\xd8" or data[:4] == b"\x89PNG" or data[:4] == b"GIF8" or data[:4] == b"RIFF"

    async def get_cached_urls_batch(self, urls: list) -> dict:
        """
        Batch lookup of cached image URLs for API response.

        Args:
            urls: List of external image URLs to check

        Returns:
            Dict mapping external_url → cached_image_url (or None if not cached)
        """
        if not urls:
            return {}

        # Import here to avoid circular imports
        from apps.recipes.models import CachedSearchImage

        # Single query for all URLs (avoids N lookups).
        cached_images = await sync_to_async(
            lambda: list(
                CachedSearchImage.objects.filter(
                    external_url__in=urls,
                    status=CachedSearchImage.STATUS_SUCCESS,
                    image__isnull=False,
                ).exclude(image="")
            )
        )()

        # Build result dict
        result = {}
        for cached in cached_images:
            if cached.image:
                result[cached.external_url] = cached.image.url
                # Update access time without saving to DB unnecessarily
                # (updated via auto_now on next modification)

        return result

    def _generate_filename(self, image_url: str) -> str:
        """
        Generate unique hash-based filename for cached image.

        Always uses .jpg extension since all images are converted to JPEG.

        Args:
            image_url: External image URL

        Returns:
            Filename like 'search_{hash}.jpg'
        """
        # MD5 is fine here: used for cache-key uniqueness, not security.
        url_hash = hashlib.md5(image_url.encode(), usedforsecurity=False).hexdigest()[:12]
        return f"search_{url_hash}.jpg"

    def _convert_to_jpeg(self, image_data: bytes) -> bytes | None:
        """
        Convert image to JPEG format for iOS 9 compatibility.

        iOS 9 Safari doesn't support WebP (added in Safari 14/iOS 14).
        This converts any image format (WebP, PNG, etc.) to JPEG.

        Args:
            image_data: Raw image bytes in any format

        Returns:
            JPEG image bytes, or None if conversion fails
        """
        if len(image_data) > MAX_IMAGE_SIZE:
            logger.warning("Image data too large for processing: %d bytes", len(image_data))
            return None

        try:
            # Open image from bytes (MAX_IMAGE_PIXELS protects against decompression bombs)
            img = Image.open(io.BytesIO(image_data))

            # Convert RGBA to RGB (JPEG doesn't support transparency)
            if img.mode in ("RGBA", "LA", "P"):
                # Flatten transparency onto a white background.
                background = Image.new("RGB", img.size, (255, 255, 255))
                if img.mode == "P":
                    img = img.convert("RGBA")
                background.paste(img, mask=img.split()[-1] if img.mode in ("RGBA", "LA") else None)
                img = background
            elif img.mode != "RGB":
                img = img.convert("RGB")

            # Save as JPEG
            output = io.BytesIO()
            img.save(output, format="JPEG", quality=92, optimize=True)
            return output.getvalue()

        except Exception as e:
            logger.error("Failed to convert image to JPEG: %s", e)
            return None