Coverage for apps / recipes / services / scraper.py: 75%

257 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-02 13:22 +0000

1""" 

2Recipe scraper service using curl_cffi and recipe-scrapers. 

3""" 

4 

5import hashlib 

6import logging 

7import re 

8import threading 

9from io import BytesIO 

10from urllib.parse import urlparse 

11 

12from apps.recipes.services.sanitizer import sanitize_recipe_data 

13 

14from PIL import Image 

15from asgiref.sync import sync_to_async 

16from django.core.files.base import ContentFile 

17from django.utils import timezone 

18from curl_cffi.requests import AsyncSession 

19from recipe_scrapers import scrape_html 

20 

21from apps.core.validators import ( 

22 MAX_HTML_SIZE, 

23 MAX_IMAGE_SIZE, 

24 MAX_REDIRECT_HOPS, 

25 check_content_size, 

26 check_response_size, 

27 validate_url, 

28 validate_redirect_url, 

29) 

30from apps.recipes.services.fingerprint import BROWSER_PROFILES 

31 

# Limit decompression bomb attacks via PIL.
# NOTE(review): 178_956_970 is exactly twice Pillow's documented default limit
# (89_478_485), so this assignment raises the threshold rather than lowering it.
# Per the Pillow documentation an image above MAX_IMAGE_PIXELS only triggers a
# DecompressionBombWarning; DecompressionBombError is raised above twice this
# value. Confirm that this is the intended threshold.
Image.MAX_IMAGE_PIXELS = 178_956_970  # ~180 megapixels

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

36 

37 

class ScraperError(Exception):
    """Root of the exception hierarchy raised by the recipe scraper service."""

40 

41 

class FetchError(ScraperError):
    """Raised when a URL is rejected by validation or cannot be fetched."""

44 

45 

class ParseError(ScraperError):
    """Raised when fetched HTML cannot be turned into usable recipe data."""

48 

49 

50class RecipeScraper: 

51 """Async recipe scraper with browser fingerprint impersonation.""" 

52 

53 DEFAULT_TIMEOUT = 30 

54 

55 def __init__(self): 

56 self.timeout = self.DEFAULT_TIMEOUT 

57 

58 async def scrape_url(self, url: str, profile: "Profile") -> "Recipe": 

59 """ 

60 Scrape a recipe from a URL and save it to the database. 

61 

62 Args: 

63 url: The recipe URL to scrape 

64 profile: The profile that will own this recipe 

65 

66 Returns: 

67 Recipe model instance 

68 

69 Raises: 

70 FetchError: If the URL cannot be fetched 

71 ParseError: If the HTML cannot be parsed as a recipe 

72 """ 

73 # Import here to avoid circular imports 

74 from apps.recipes.models import Recipe 

75 

76 # Validate URL for SSRF protection (returns pinned DNS resolution) 

77 try: 

78 resolved = validate_url(url) 

79 except ValueError as e: 

80 raise FetchError(str(e)) 

81 

82 # Fetch HTML using pinned DNS to prevent TOCTOU rebinding 

83 html = await self._fetch_html(url, resolved.curl_resolve) 

84 

85 # Parse recipe data 

86 data = self._parse_recipe(html, url) 

87 

88 # Check for cached search image first, then download if needed 

89 image_file = None 

90 if data.get("image_url"): 

91 # Try to reuse cached image from search results 

92 from apps.recipes.models import CachedSearchImage 

93 

94 try: 

95 cached = await sync_to_async(CachedSearchImage.objects.get)( 

96 external_url=data["image_url"], status=CachedSearchImage.STATUS_SUCCESS 

97 ) 

98 

99 if cached.image: 

100 # Reuse cached image file 

101 with cached.image.open("rb") as f: 

102 image_file = ContentFile(f.read()) 

103 

104 # Update access time to prevent cleanup 

105 cached.last_accessed_at = timezone.now() 

106 await sync_to_async(cached.save)(update_fields=["last_accessed_at"]) 

107 

108 logger.info(f"Reused cached image for {data['image_url']}") 

109 

110 except CachedSearchImage.DoesNotExist: 

111 pass 

112 

113 # If no cache, download as normal 

114 if not image_file: 

115 image_file = await self._download_image(data["image_url"]) 

116 

117 # Create recipe record 

118 recipe = Recipe( 

119 profile=profile, 

120 source_url=url, 

121 canonical_url=data.get("canonical_url", ""), 

122 host=data["host"], 

123 site_name=data.get("site_name", ""), 

124 title=data["title"], 

125 author=data.get("author", ""), 

126 description=data.get("description", ""), 

127 image_url=data.get("image_url", ""), 

128 ingredients=data.get("ingredients", []), 

129 ingredient_groups=data.get("ingredient_groups", []), 

130 instructions=data.get("instructions", []), 

131 instructions_text=data.get("instructions_text", ""), 

132 prep_time=data.get("prep_time"), 

133 cook_time=data.get("cook_time"), 

134 total_time=data.get("total_time"), 

135 yields=data.get("yields", ""), 

136 servings=data.get("servings"), 

137 category=data.get("category", ""), 

138 cuisine=data.get("cuisine", ""), 

139 cooking_method=data.get("cooking_method", ""), 

140 keywords=data.get("keywords", []), 

141 dietary_restrictions=data.get("dietary_restrictions", []), 

142 equipment=data.get("equipment", []), 

143 nutrition=data.get("nutrition", {}), 

144 rating=data.get("rating"), 

145 rating_count=data.get("rating_count"), 

146 language=data.get("language", ""), 

147 links=data.get("links", []), 

148 ) 

149 

150 # Save first to get an ID for the image path 

151 await sync_to_async(recipe.save)() 

152 

153 # Attach image if downloaded 

154 if image_file: 

155 filename = self._generate_image_filename(url, data.get("image_url", "")) 

156 await sync_to_async(recipe.image.save)(filename, image_file, save=True) 

157 

158 # Fire-and-forget: Generate AI tips in background thread (non-blocking) 

159 thread = threading.Thread(target=self._generate_tips_background, args=(recipe.id,), daemon=True) 

160 thread.start() 

161 

162 return recipe 

163 

164 def _generate_tips_background(self, recipe_id: int): 

165 """Generate AI tips for a recipe in background thread.""" 

166 try: 

167 import django 

168 

169 django.setup() # Ensure Django is configured in thread 

170 

171 from apps.core.models import AppSettings 

172 from apps.ai.services.tips import generate_tips 

173 

174 # Check if AI is available 

175 settings_obj = AppSettings.get() 

176 if not settings_obj.openrouter_api_key: 

177 logger.debug(f"Skipping tips generation for recipe {recipe_id}: No API key") 

178 return 

179 

180 # Generate tips 

181 generate_tips(recipe_id) 

182 logger.info(f"Auto-generated tips for recipe {recipe_id}") 

183 

184 except Exception as e: 

185 # Log but don't fail - tips generation is optional 

186 logger.warning(f"Failed to auto-generate tips for recipe {recipe_id}: {e}") 

187 

188 async def _fetch_html(self, url: str, curl_resolve: list[str] | None = None) -> str: 

189 """ 

190 Fetch HTML from URL with browser impersonation. 

191 

192 Follows redirects manually with per-hop SSRF validation (max 5 hops). 

193 Enforces response size limit (10MB). 

194 Tries multiple browser profiles if initial request fails. 

195 

196 Args: 

197 url: URL to fetch 

198 curl_resolve: DNS pinning list from validate_url to prevent TOCTOU rebinding 

199 """ 

200 errors = [] 

201 

202 for profile in BROWSER_PROFILES: 

203 try: 

204 html = await self._fetch_with_redirects(url, profile, MAX_HTML_SIZE, curl_resolve) 

205 if html is not None: 

206 return html 

207 errors.append(f"{profile}: empty response") 

208 except FetchError: 

209 raise 

210 except ValueError as e: 

211 raise FetchError(str(e)) 

212 except Exception as e: 

213 errors.append(f"{profile}: {str(e)}") 

214 continue 

215 

216 raise FetchError(f"Failed to fetch {url}: {'; '.join(errors)}") 

217 

218 async def _fetch_with_redirects(self, url, profile, max_size, curl_resolve=None): 

219 """Fetch URL following redirects with per-hop SSRF validation and DNS pinning.""" 

220 from curl_cffi import CurlOpt 

221 

222 current_url = url 

223 current_resolve = curl_resolve or [] 

224 for _ in range(MAX_REDIRECT_HOPS): 

225 curl_opts = {CurlOpt.RESOLVE: current_resolve} if current_resolve else {} 

226 async with AsyncSession(impersonate=profile, curl_options=curl_opts) as session: 

227 response = await session.get( 

228 current_url, 

229 timeout=self.timeout, 

230 allow_redirects=False, 

231 ) 

232 

233 if response.status_code in (301, 302, 303, 307, 308): 

234 location = response.headers.get("location") 

235 if not location: 

236 raise FetchError("Redirect without Location header") 

237 resolved = validate_redirect_url(location) 

238 current_url = location 

239 current_resolve = resolved.curl_resolve 

240 continue 

241 

242 if response.status_code == 200: 

243 if not check_response_size(response, max_size): 

244 raise FetchError(f"Response too large (Content-Length > {max_size})") 

245 content = response.text 

246 check_content_size(content.encode("utf-8", errors="replace"), max_size) 

247 return content 

248 

249 if response.status_code == 404: 

250 raise FetchError("Recipe page not found at that URL") 

251 if response.status_code == 429: 

252 raise FetchError("Recipe site is temporarily rate-limiting — try again shortly") 

253 return None # 403/5xx: let other browser profiles try 

254 

255 raise FetchError(f"Too many redirects (>{MAX_REDIRECT_HOPS})") 

256 

257 def _parse_recipe(self, html: str, url: str) -> dict: 

258 """ 

259 Parse recipe data from HTML using recipe-scrapers. 

260 """ 

261 try: 

262 # supported_only=False allows scraping from any domain using schema.org 

263 scraper = scrape_html(html, org_url=url, supported_only=False) 

264 except Exception as e: 

265 raise ParseError(f"Failed to parse recipe: {str(e)}") 

266 

267 # Extract host from URL 

268 parsed_url = urlparse(url) 

269 host = parsed_url.netloc.replace("www.", "") 

270 

271 # Build recipe data dict with safe attribute access 

272 data = { 

273 "host": host, 

274 "title": self._safe_get(scraper, "title", ""), 

275 "canonical_url": self._safe_get(scraper, "canonical_url", ""), 

276 "site_name": self._safe_get(scraper, "site_name", ""), 

277 "author": self._safe_get(scraper, "author", ""), 

278 "description": self._safe_get(scraper, "description", ""), 

279 "image_url": self._safe_get(scraper, "image", ""), 

280 "ingredients": self._safe_get(scraper, "ingredients", []), 

281 "ingredient_groups": self._safe_get_ingredient_groups(scraper), 

282 "instructions": self._safe_get(scraper, "instructions_list", []), 

283 "instructions_text": self._safe_get(scraper, "instructions", ""), 

284 "prep_time": self._parse_time(self._safe_get(scraper, "prep_time")), 

285 "cook_time": self._parse_time(self._safe_get(scraper, "cook_time")), 

286 "total_time": self._parse_time(self._safe_get(scraper, "total_time")), 

287 "yields": self._safe_get(scraper, "yields", ""), 

288 "servings": self._parse_servings(self._safe_get(scraper, "yields", "")), 

289 "category": self._safe_get(scraper, "category", ""), 

290 "cuisine": self._safe_get(scraper, "cuisine", ""), 

291 "cooking_method": self._safe_get(scraper, "cooking_method", ""), 

292 "keywords": self._safe_get(scraper, "keywords", []), 

293 "dietary_restrictions": self._safe_get(scraper, "dietary_restrictions", []), 

294 "equipment": self._safe_get(scraper, "equipment", []), 

295 "nutrition": self._safe_get(scraper, "nutrients", {}), 

296 "rating": self._parse_rating(self._safe_get(scraper, "ratings")), 

297 "rating_count": self._parse_rating_count(self._safe_get(scraper, "ratings_count")), 

298 "language": self._safe_get(scraper, "language", ""), 

299 "links": self._safe_get(scraper, "links", []), 

300 } 

301 

302 if not data["title"]: 

303 raise ParseError("Recipe has no title") 

304 

305 # Sanitize all text fields to strip HTML (defense-in-depth against stored XSS) 

306 sanitize_recipe_data(data) 

307 

308 return data 

309 

310 def _safe_get(self, scraper, attr: str, default=None): 

311 """Safely get an attribute from the scraper.""" 

312 try: 

313 method = getattr(scraper, attr, None) 

314 if callable(method): 

315 result = method() 

316 return result if result is not None else default 

317 return default 

318 except Exception: 

319 logger.debug("Failed to get %s from scraper", attr, exc_info=True) 

320 return default 

321 

322 def _safe_get_ingredient_groups(self, scraper) -> list: 

323 """Get ingredient groups if available.""" 

324 try: 

325 groups = scraper.ingredient_groups() 

326 if groups: 

327 return [ 

328 {"purpose": getattr(g, "purpose", ""), "ingredients": getattr(g, "ingredients", [])} for g in groups 

329 ] 

330 except Exception: 

331 logger.warning("Failed to get ingredient groups from scraper", exc_info=True) 

332 return [] 

333 

334 def _parse_time(self, value) -> int | None: 

335 """Parse time value to minutes.""" 

336 if value is None: 

337 return None 

338 if isinstance(value, (int, float)): 

339 return int(value) 

340 if isinstance(value, str): 

341 # Try to extract number 

342 match = re.search(r"(\d+)", value) 

343 if match: 

344 return int(match.group(1)) 

345 return None 

346 

347 def _parse_servings(self, yields: str) -> int | None: 

348 """Extract serving count from yields string.""" 

349 if not yields: 

350 return None 

351 match = re.search(r"(\d+)", yields) 

352 if match: 

353 return int(match.group(1)) 

354 return None 

355 

356 def _parse_rating(self, value) -> float | None: 

357 """Parse rating value to float.""" 

358 if value is None: 

359 return None 

360 try: 

361 return float(value) 

362 except (ValueError, TypeError): 

363 return None 

364 

365 def _parse_rating_count(self, value) -> int | None: 

366 """Parse rating count to int.""" 

367 if value is None: 

368 return None 

369 try: 

370 return int(value) 

371 except (ValueError, TypeError): 

372 return None 

373 

374 async def _download_image(self, image_url: str) -> ContentFile | None: 

375 """ 

376 Download recipe image and return as ContentFile. 

377 

378 Validates image URL against SSRF blocklist before fetching. 

379 Follows redirects manually with per-hop validation (max 5 hops). 

380 Enforces response size limit (50MB). 

381 WebP images are converted to JPEG for iOS 9 compatibility. 

382 """ 

383 if not image_url: 

384 return None 

385 

386 # Validate image URL for SSRF protection (FR-001) 

387 try: 

388 resolved = validate_url(image_url) 

389 except ValueError: 

390 logger.warning("Blocked image URL (SSRF): %s", image_url) 

391 return None 

392 

393 for profile in BROWSER_PROFILES: 

394 try: 

395 content = await self._fetch_image_with_redirects(image_url, profile, resolved.curl_resolve) 

396 if content is not None: 

397 content = self._convert_webp_to_jpeg(content) 

398 return ContentFile(content) 

399 except Exception as e: 

400 logger.warning( 

401 "Failed to download image %s with %s: %s", 

402 image_url, 

403 profile, 

404 e, 

405 ) 

406 continue 

407 

408 return None 

409 

410 async def _fetch_image_with_redirects(self, url, profile, curl_resolve=None): 

411 """Fetch image following redirects with per-hop SSRF validation and DNS pinning.""" 

412 from curl_cffi import CurlOpt 

413 

414 current_url = url 

415 current_resolve = curl_resolve or [] 

416 for _ in range(MAX_REDIRECT_HOPS): 

417 curl_opts = {CurlOpt.RESOLVE: current_resolve} if current_resolve else {} 

418 async with AsyncSession(impersonate=profile, curl_options=curl_opts) as session: 

419 response = await session.get( 

420 current_url, 

421 timeout=self.timeout, 

422 allow_redirects=False, 

423 ) 

424 

425 if response.status_code in (301, 302, 303, 307, 308): 

426 location = response.headers.get("location") 

427 if not location: 

428 return None 

429 try: 

430 resolved = validate_redirect_url(location) 

431 except ValueError: 

432 return None 

433 current_url = location 

434 current_resolve = resolved.curl_resolve 

435 continue 

436 

437 if response.status_code == 200: 

438 content_type = response.headers.get("content-type", "") 

439 if "image" not in content_type and not self._is_image_url(current_url): 

440 return None 

441 if not check_response_size(response, MAX_IMAGE_SIZE): 

442 logger.warning("Image too large: %s", current_url) 

443 return None 

444 content = response.content 

445 if len(content) > MAX_IMAGE_SIZE: 

446 logger.warning("Image content too large: %s", current_url) 

447 return None 

448 return content 

449 

450 return None 

451 

452 logger.warning("Too many redirects for image: %s", url) 

453 return None 

454 

455 def _convert_webp_to_jpeg(self, content: bytes) -> bytes: 

456 """Convert WebP images to JPEG for iOS 9 compatibility. 

457 

458 Also resizes very large images to reduce file size. 

459 Rejects images that exceed the size limit (decompression bomb protection). 

460 """ 

461 if len(content) > MAX_IMAGE_SIZE: 

462 logger.warning("Image content too large for processing: %d bytes", len(content)) 

463 return content 

464 

465 try: 

466 img = Image.open(BytesIO(content)) 

467 

468 # Check if conversion is needed (WebP or very large) 

469 needs_conversion = img.format == "WEBP" 

470 needs_resize = img.width > 1200 or img.height > 1200 

471 

472 if not needs_conversion and not needs_resize: 

473 return content 

474 

475 # Resize if too large (max 1200px on longest side) 

476 if needs_resize: 

477 img.thumbnail((1200, 1200), Image.Resampling.LANCZOS) 

478 

479 # Convert to RGB if needed (for JPEG) 

480 if img.mode in ("RGBA", "P"): 

481 img = img.convert("RGB") 

482 

483 # Save as JPEG 

484 output = BytesIO() 

485 img.save(output, format="JPEG", quality=85, optimize=True) 

486 logger.info(f"Converted image: {img.format} -> JPEG, resized: {needs_resize}") 

487 return output.getvalue() 

488 

489 except Exception as e: 

490 logger.warning(f"Image conversion failed: {e}, using original") 

491 return content 

492 

493 def _is_image_url(self, url: str) -> bool: 

494 """Check if URL looks like an image.""" 

495 image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".webp") 

496 parsed = urlparse(url) 

497 return parsed.path.lower().endswith(image_extensions) 

498 

499 def _generate_image_filename(self, recipe_url: str, image_url: str) -> str: 

500 """Generate a unique filename for the recipe image. 

501 

502 Always uses .jpg extension since images are converted to JPEG 

503 for iOS 9 compatibility. 

504 """ 

505 # Create hash from URLs for uniqueness 

506 url_hash = hashlib.md5(f"{recipe_url}{image_url}".encode(), usedforsecurity=False).hexdigest()[:12] 

507 

508 return f"recipe_{url_hash}.jpg" 

← Back to Dashboard