Coverage for apps/recipes/services/image_cache.py: 76%

143 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-12 10:49 +0000

1""" 

2Search result image caching service for iOS 9 compatibility. 

3 

4Implements fire-and-forget batch downloads to cache external recipe images 

5locally, avoiding CORS and security issues on older Safari browsers. 

6""" 

7 

8import asyncio 

9import hashlib 

10import io 

11import logging 

12 

13from asgiref.sync import sync_to_async 

14from curl_cffi.requests import AsyncSession 

15from django.core.files.base import ContentFile 

16from PIL import Image 

17 

18from apps.core.validators import ( 

19 MAX_IMAGE_SIZE, 

20 MAX_REDIRECT_HOPS, 

21 check_response_size, 

22 validate_redirect_url, 

23 validate_url, 

24) 

25from apps.recipes.services.fingerprint import BROWSER_PROFILES 

26 

# Cap PIL's allowed pixel count to limit decompression-bomb attacks:
# Image.open() raises DecompressionBombError beyond this many pixels.
Image.MAX_IMAGE_PIXELS = 178_956_970

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

31 

32 

33class SearchImageCache: 

34 """ 

35 Service for caching search result images to local storage. 

36 

37 Enables iOS 9 Safari compatibility by downloading external recipe images 

38 to the server immediately (fire-and-forget), then returning local URLs 

39 that don't trigger CORS restrictions. 

40 

41 Browser profiles are centralized in fingerprint.py for maintainability. 

42 """ 

43 

44 MAX_CONCURRENT = 5 

45 DOWNLOAD_TIMEOUT = 15 

46 

47 async def cache_images(self, image_urls: list) -> None: 

48 """ 

49 Fire-and-forget batch download of search result images. 

50 

51 Args: 

52 image_urls: List of external image URLs to cache 

53 

54 Returns: 

55 None (errors logged but not raised) 

56 """ 

57 if not image_urls: 

58 return 

59 

60 # Create semaphore to limit concurrent downloads 

61 semaphore = asyncio.Semaphore(self.MAX_CONCURRENT) 

62 

63 # Create download tasks 

64 tasks = [self._download_and_save(None, semaphore, url) for url in image_urls] 

65 

66 # Run concurrently without awaiting completion 

67 if tasks: 

68 await asyncio.gather(*tasks, return_exceptions=True) 

69 

70 async def _download_and_save(self, session: AsyncSession, semaphore: asyncio.Semaphore, url: str) -> None: 

71 """ 

72 Download and cache a single image with status tracking. 

73 

74 Args: 

75 session: AsyncSession (can be None, will create if needed) 

76 semaphore: Semaphore to limit concurrent downloads 

77 url: External image URL to cache 

78 """ 

79 # Import here to avoid circular imports 

80 from apps.recipes.models import CachedSearchImage 

81 

82 async with semaphore: 

83 try: 

84 # Get or create cache record 

85 cached, created = await sync_to_async(CachedSearchImage.objects.get_or_create)( 

86 external_url=url, defaults={"status": CachedSearchImage.STATUS_PENDING} 

87 ) 

88 

89 # Skip if already successfully cached 

90 if cached.status == CachedSearchImage.STATUS_SUCCESS and cached.image: 

91 return 

92 

93 # Download image 

94 image_data = await self._fetch_image(url) 

95 if not image_data: 

96 cached.status = CachedSearchImage.STATUS_FAILED 

97 await sync_to_async(cached.save)(update_fields=["status"]) 

98 return 

99 

100 # Convert to JPEG for iOS 9 compatibility (no WebP support) 

101 converted_data = self._convert_to_jpeg(image_data) 

102 if not converted_data: 

103 cached.status = CachedSearchImage.STATUS_FAILED 

104 await sync_to_async(cached.save)(update_fields=["status"]) 

105 return 

106 

107 # Generate filename and save 

108 filename = self._generate_filename(url) 

109 cached.image = ContentFile(converted_data, name=filename) 

110 cached.status = CachedSearchImage.STATUS_SUCCESS 

111 await sync_to_async(cached.save)(update_fields=["image", "status"]) 

112 logger.info("Cached 1 search image") 

113 logger.debug("Cached image from %s", url) 

114 

115 except Exception as e: 

116 logger.error("Failed to cache search image: %s", e) 

117 logger.debug("Failed image URL: %s", url) 

118 # Try to mark as failed if we have a record 

119 try: 

120 from apps.recipes.models import CachedSearchImage 

121 

122 cached = await sync_to_async(CachedSearchImage.objects.get)(external_url=url) 

123 cached.status = CachedSearchImage.STATUS_FAILED 

124 await sync_to_async(cached.save)(update_fields=["status"]) 

125 except Exception: 

126 logger.warning("Failed to mark cached image as failed for %s", url, exc_info=True) 

127 

128 async def _fetch_image(self, url: str) -> bytes | None: 

129 """ 

130 Fetch image content from URL with browser profile fallback. 

131 

132 Tries multiple browser profiles if initial request fails. 

133 Browser profiles are configured in fingerprint.py. 

134 

135 Args: 

136 url: Image URL to fetch 

137 

138 Returns: 

139 Image bytes or None if fetch fails 

140 """ 

141 # Validate URL for SSRF protection (returns pinned DNS resolution) 

142 try: 

143 resolved = validate_url(url) 

144 except ValueError: 

145 logger.warning(f"Blocked image URL (SSRF): {url}") 

146 return None 

147 

148 # Try each browser profile with manual redirect following 

149 for profile in BROWSER_PROFILES: 

150 try: 

151 content = await self._fetch_image_safe(url, profile, resolved.curl_resolve) 

152 if content is not None: 

153 return content 

154 except Exception as e: 

155 logger.debug(f"Failed to fetch image {url} with {profile}: {e}") 

156 continue 

157 

158 return None 

159 

160 async def _fetch_image_safe(self, url, profile, curl_resolve=None): 

161 """Fetch image following redirects with per-hop SSRF validation and DNS pinning.""" 

162 from curl_cffi import CurlOpt 

163 

164 current_url = url 

165 current_resolve = curl_resolve or [] 

166 for _ in range(MAX_REDIRECT_HOPS): 

167 curl_opts = {CurlOpt.RESOLVE: current_resolve} if current_resolve else {} 

168 async with AsyncSession(impersonate=profile, curl_options=curl_opts) as session: 

169 response = await session.get( 

170 current_url, 

171 timeout=self.DOWNLOAD_TIMEOUT, 

172 allow_redirects=False, 

173 ) 

174 

175 if response.status_code in (301, 302, 303, 307, 308): 

176 location = response.headers.get("location") 

177 if not location: 

178 return None 

179 try: 

180 resolved = validate_redirect_url(location) 

181 except ValueError: 

182 return None 

183 current_url = location 

184 current_resolve = resolved.curl_resolve 

185 continue 

186 

187 if response.status_code in (404, 410): 

188 return None 

189 

190 if response.status_code == 200 and response.content: 

191 if not check_response_size(response, MAX_IMAGE_SIZE): 

192 logger.warning("Image too large: %s", current_url) 

193 return None 

194 if len(response.content) > MAX_IMAGE_SIZE: 

195 return None 

196 content_type = response.headers.get("content-type", "") 

197 if "image" in content_type: 

198 return response.content 

199 if self._looks_like_image(response.content): 

200 return response.content 

201 

202 return None 

203 

204 logger.warning("Too many redirects for image: %s", url) 

205 return None 

206 

207 @staticmethod 

208 def _looks_like_image(data: bytes) -> bool: 

209 """Check if bytes look like an image by inspecting magic bytes.""" 

210 if len(data) < 4: 

211 return False 

212 # JPEG, PNG, GIF, WebP magic bytes 

213 return data[:2] == b"\xff\xd8" or data[:4] == b"\x89PNG" or data[:4] == b"GIF8" or data[:4] == b"RIFF" 

214 

215 async def get_cached_urls_batch(self, urls: list) -> dict: 

216 """ 

217 Batch lookup of cached image URLs for API response. 

218 

219 Args: 

220 urls: List of external image URLs to check 

221 

222 Returns: 

223 Dict mapping external_url → cached_image_url (or None if not cached) 

224 """ 

225 if not urls: 

226 return {} 

227 

228 # Import here to avoid circular imports 

229 from apps.recipes.models import CachedSearchImage 

230 

231 # Query all at once 

232 cached_images = await sync_to_async( 

233 lambda: list( 

234 CachedSearchImage.objects.filter( 

235 external_url__in=urls, 

236 status=CachedSearchImage.STATUS_SUCCESS, 

237 image__isnull=False, 

238 ).exclude(image="") 

239 ) 

240 )() 

241 

242 # Build result dict 

243 result = {} 

244 for cached in cached_images: 

245 if cached.image: 

246 result[cached.external_url] = cached.image.url 

247 # Update access time without saving to DB unnecessarily 

248 # (updated via auto_now on next modification) 

249 

250 return result 

251 

252 def _generate_filename(self, image_url: str) -> str: 

253 """ 

254 Generate unique hash-based filename for cached image. 

255 

256 Always uses .jpg extension since all images are converted to JPEG. 

257 

258 Args: 

259 image_url: External image URL 

260 

261 Returns: 

262 Filename like 'search_{hash}.jpg' 

263 """ 

264 url_hash = hashlib.md5(image_url.encode(), usedforsecurity=False).hexdigest()[:12] 

265 return f"search_{url_hash}.jpg" 

266 

267 def _convert_to_jpeg(self, image_data: bytes) -> bytes | None: 

268 """ 

269 Convert image to JPEG format for iOS 9 compatibility. 

270 

271 iOS 9 Safari doesn't support WebP (added in Safari 14/iOS 14). 

272 This converts any image format (WebP, PNG, etc.) to JPEG. 

273 

274 Args: 

275 image_data: Raw image bytes in any format 

276 

277 Returns: 

278 JPEG image bytes, or None if conversion fails 

279 """ 

280 if len(image_data) > MAX_IMAGE_SIZE: 

281 logger.warning("Image data too large for processing: %d bytes", len(image_data)) 

282 return None 

283 

284 try: 

285 # Open image from bytes (MAX_IMAGE_PIXELS protects against decompression bombs) 

286 img = Image.open(io.BytesIO(image_data)) 

287 

288 # Convert RGBA to RGB (JPEG doesn't support transparency) 

289 if img.mode in ("RGBA", "LA", "P"): 

290 # Create white background 

291 background = Image.new("RGB", img.size, (255, 255, 255)) 

292 if img.mode == "P": 

293 img = img.convert("RGBA") 

294 background.paste(img, mask=img.split()[-1] if img.mode in ("RGBA", "LA") else None) 

295 img = background 

296 elif img.mode != "RGB": 

297 img = img.convert("RGB") 

298 

299 # Save as JPEG 

300 output = io.BytesIO() 

301 img.save(output, format="JPEG", quality=92, optimize=True) 

302 return output.getvalue() 

303 

304 except Exception as e: 

305 logger.error(f"Failed to convert image to JPEG: {e}") 

306 return None 

← Back to Dashboard