Coverage for apps / recipes / services / scraper.py: 76%

253 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-12 10:49 +0000

1""" 

2Recipe scraper service using curl_cffi and recipe-scrapers. 

3""" 

4 

5import hashlib 

6import logging 

7import re 

8import threading 

9from io import BytesIO 

10from urllib.parse import urlparse 

11 

12from apps.recipes.services.sanitizer import sanitize_recipe_data 

13 

14from PIL import Image 

15from asgiref.sync import sync_to_async 

16from django.core.files.base import ContentFile 

17from django.utils import timezone 

18from curl_cffi.requests import AsyncSession 

19from recipe_scrapers import scrape_html 

20 

21from apps.core.validators import ( 

22 MAX_HTML_SIZE, 

23 MAX_IMAGE_SIZE, 

24 MAX_REDIRECT_HOPS, 

25 check_content_size, 

26 check_response_size, 

27 validate_url, 

28 validate_redirect_url, 

29) 

30from apps.recipes.services.fingerprint import BROWSER_PROFILES 

31 

# Limit decompression bomb attacks via PIL
Image.MAX_IMAGE_PIXELS = 178_956_970  # ~180 megapixels

# Module-level logger for this service.
logger = logging.getLogger(__name__)

36 

37 

class ScraperError(Exception):
    """Base class for all errors raised by the recipe scraper service."""

42 

43 

class FetchError(ScraperError):
    """Raised when a URL cannot be fetched."""

48 

49 

class ParseError(ScraperError):
    """Raised when fetched HTML does not yield a usable recipe."""

54 

55 

class RecipeScraper:
    """
    Async recipe scraper with browser fingerprint impersonation.

    Pages are fetched with curl_cffi (to get past anti-bot measures) and
    structured recipe data is parsed out of the HTML with recipe-scrapers.

    Browser profiles are centralized in fingerprint.py for maintainability.
    """

    # Per-request timeout, in seconds.
    DEFAULT_TIMEOUT = 30

    def __init__(self):
        # Instance-level copy so a single scraper could be tuned without
        # touching the class-wide default.
        self.timeout = self.DEFAULT_TIMEOUT

70 

    async def scrape_url(self, url: str, profile: "Profile") -> "Recipe":
        """
        Scrape a recipe from a URL and save it to the database.

        Args:
            url: The recipe URL to scrape
            profile: The profile that will own this recipe

        Returns:
            Recipe model instance, already saved, with an image attached
            when one could be obtained (from cache or download)

        Raises:
            FetchError: If the URL cannot be fetched (or fails SSRF validation)
            ParseError: If the HTML cannot be parsed as a recipe
        """
        # Import here to avoid circular imports
        from apps.recipes.models import Recipe

        # Validate URL for SSRF protection (returns pinned DNS resolution)
        try:
            resolved = validate_url(url)
        except ValueError as e:
            raise FetchError(str(e))

        # Fetch HTML using pinned DNS to prevent TOCTOU rebinding
        html = await self._fetch_html(url, resolved.curl_resolve)

        # Parse recipe data
        data = self._parse_recipe(html, url)

        # Check for cached search image first, then download if needed
        image_file = None
        if data.get("image_url"):
            # Try to reuse cached image from search results
            from apps.recipes.models import CachedSearchImage

            try:
                cached = await sync_to_async(CachedSearchImage.objects.get)(
                    external_url=data["image_url"], status=CachedSearchImage.STATUS_SUCCESS
                )

                if cached.image:
                    # Reuse cached image file
                    with cached.image.open("rb") as f:
                        image_file = ContentFile(f.read())

                    # Update access time to prevent cleanup
                    cached.last_accessed_at = timezone.now()
                    await sync_to_async(cached.save)(update_fields=["last_accessed_at"])

                    logger.info(f"Reused cached image for {data['image_url']}")

            except CachedSearchImage.DoesNotExist:
                pass

            # If no cache, download as normal
            if not image_file:
                image_file = await self._download_image(data["image_url"])

        # Create recipe record (unsaved; missing fields fall back to
        # empty/None defaults supplied by _parse_recipe's safe getters)
        recipe = Recipe(
            profile=profile,
            source_url=url,
            canonical_url=data.get("canonical_url", ""),
            host=data["host"],
            site_name=data.get("site_name", ""),
            title=data["title"],
            author=data.get("author", ""),
            description=data.get("description", ""),
            image_url=data.get("image_url", ""),
            ingredients=data.get("ingredients", []),
            ingredient_groups=data.get("ingredient_groups", []),
            instructions=data.get("instructions", []),
            instructions_text=data.get("instructions_text", ""),
            prep_time=data.get("prep_time"),
            cook_time=data.get("cook_time"),
            total_time=data.get("total_time"),
            yields=data.get("yields", ""),
            servings=data.get("servings"),
            category=data.get("category", ""),
            cuisine=data.get("cuisine", ""),
            cooking_method=data.get("cooking_method", ""),
            keywords=data.get("keywords", []),
            dietary_restrictions=data.get("dietary_restrictions", []),
            equipment=data.get("equipment", []),
            nutrition=data.get("nutrition", {}),
            rating=data.get("rating"),
            rating_count=data.get("rating_count"),
            language=data.get("language", ""),
            links=data.get("links", []),
        )

        # Save first to get an ID for the image path
        await sync_to_async(recipe.save)()

        # Attach image if downloaded
        if image_file:
            filename = self._generate_image_filename(url, data.get("image_url", ""))
            await sync_to_async(recipe.image.save)(filename, image_file, save=True)

        # Fire-and-forget: Generate AI tips in background thread (non-blocking).
        # daemon=True so the thread never blocks process shutdown.
        thread = threading.Thread(target=self._generate_tips_background, args=(recipe.id,), daemon=True)
        thread.start()

        return recipe

176 

177 def _generate_tips_background(self, recipe_id: int): 

178 """Generate AI tips for a recipe in background thread.""" 

179 try: 

180 import django 

181 

182 django.setup() # Ensure Django is configured in thread 

183 

184 from apps.core.models import AppSettings 

185 from apps.ai.services.tips import generate_tips 

186 

187 # Check if AI is available 

188 settings_obj = AppSettings.get() 

189 if not settings_obj.openrouter_api_key: 

190 logger.debug(f"Skipping tips generation for recipe {recipe_id}: No API key") 

191 return 

192 

193 # Generate tips 

194 generate_tips(recipe_id) 

195 logger.info(f"Auto-generated tips for recipe {recipe_id}") 

196 

197 except Exception as e: 

198 # Log but don't fail - tips generation is optional 

199 logger.warning(f"Failed to auto-generate tips for recipe {recipe_id}: {e}") 

200 

201 async def _fetch_html(self, url: str, curl_resolve: list[str] | None = None) -> str: 

202 """ 

203 Fetch HTML from URL with browser impersonation. 

204 

205 Follows redirects manually with per-hop SSRF validation (max 5 hops). 

206 Enforces response size limit (10MB). 

207 Tries multiple browser profiles if initial request fails. 

208 

209 Args: 

210 url: URL to fetch 

211 curl_resolve: DNS pinning list from validate_url to prevent TOCTOU rebinding 

212 """ 

213 errors = [] 

214 

215 for profile in BROWSER_PROFILES: 

216 try: 

217 html = await self._fetch_with_redirects(url, profile, MAX_HTML_SIZE, curl_resolve) 

218 if html is not None: 

219 return html 

220 errors.append(f"{profile}: empty response") 

221 except FetchError: 

222 raise 

223 except ValueError as e: 

224 raise FetchError(str(e)) 

225 except Exception as e: 

226 errors.append(f"{profile}: {str(e)}") 

227 continue 

228 

229 raise FetchError(f"Failed to fetch {url}: {'; '.join(errors)}") 

230 

    async def _fetch_with_redirects(self, url, profile, max_size, curl_resolve=None):
        """Fetch URL following redirects with per-hop SSRF validation and DNS pinning.

        Args:
            url: Starting URL (already validated by the caller).
            profile: curl_cffi browser-impersonation profile.
            max_size: Maximum allowed response size in bytes.
            curl_resolve: CURLOPT_RESOLVE-style pinning list for the first hop.

        Returns:
            Decoded HTML body on HTTP 200, or None for any other
            non-redirect status.

        Raises:
            FetchError: Missing Location header, oversized response, or too
                many redirect hops.
            ValueError: Propagated from validate_redirect_url when a hop is
                rejected (callers translate this to FetchError).
        """
        from curl_cffi import CurlOpt

        current_url = url
        current_resolve = curl_resolve or []
        for _ in range(MAX_REDIRECT_HOPS):
            # Pin DNS for this hop only when a resolution is available.
            curl_opts = {CurlOpt.RESOLVE: current_resolve} if current_resolve else {}
            async with AsyncSession(impersonate=profile, curl_options=curl_opts) as session:
                response = await session.get(
                    current_url,
                    timeout=self.timeout,
                    # Redirects are followed manually so each hop can be
                    # re-validated below.
                    allow_redirects=False,
                )

                if response.status_code in (301, 302, 303, 307, 308):
                    location = response.headers.get("location")
                    if not location:
                        raise FetchError("Redirect without Location header")
                    # Re-validate and re-pin DNS on every hop (TOCTOU defense).
                    resolved = validate_redirect_url(location)
                    current_url = location
                    current_resolve = resolved.curl_resolve
                    continue

                if response.status_code == 200:
                    # Fast reject via the Content-Length header first...
                    if not check_response_size(response, max_size):
                        raise FetchError(f"Response too large (Content-Length > {max_size})")
                    content = response.text
                    # ...then verify the actual decoded body size.
                    check_content_size(content.encode("utf-8", errors="replace"), max_size)
                    return content

                # Non-200, non-redirect status: signal "no usable HTML".
                return None

        raise FetchError(f"Too many redirects (>{MAX_REDIRECT_HOPS})")

265 

    def _parse_recipe(self, html: str, url: str) -> dict:
        """
        Parse recipe data from HTML using recipe-scrapers.

        Args:
            html: Fetched page HTML.
            url: Page URL (used for host extraction and passed as org_url).

        Returns:
            Dict of normalized recipe fields. Every lookup goes through the
            safe getters, so missing fields fall back to empty/None defaults.

        Raises:
            ParseError: If the HTML cannot be parsed at all, or yields no title.
        """
        try:
            # supported_only=False allows scraping from any domain using schema.org
            scraper = scrape_html(html, org_url=url, supported_only=False)
        except Exception as e:
            raise ParseError(f"Failed to parse recipe: {str(e)}")

        # Extract host from URL
        parsed_url = urlparse(url)
        # NOTE(review): replace() strips "www." anywhere in the netloc, not
        # just as a prefix (e.g. "en.www.example.com") — confirm intended.
        host = parsed_url.netloc.replace("www.", "")

        # Build recipe data dict with safe attribute access
        data = {
            "host": host,
            "title": self._safe_get(scraper, "title", ""),
            "canonical_url": self._safe_get(scraper, "canonical_url", ""),
            "site_name": self._safe_get(scraper, "site_name", ""),
            "author": self._safe_get(scraper, "author", ""),
            "description": self._safe_get(scraper, "description", ""),
            "image_url": self._safe_get(scraper, "image", ""),
            "ingredients": self._safe_get(scraper, "ingredients", []),
            "ingredient_groups": self._safe_get_ingredient_groups(scraper),
            "instructions": self._safe_get(scraper, "instructions_list", []),
            "instructions_text": self._safe_get(scraper, "instructions", ""),
            # Times are normalized to whole minutes (int) or None.
            "prep_time": self._parse_time(self._safe_get(scraper, "prep_time")),
            "cook_time": self._parse_time(self._safe_get(scraper, "cook_time")),
            "total_time": self._parse_time(self._safe_get(scraper, "total_time")),
            "yields": self._safe_get(scraper, "yields", ""),
            # Servings is derived from the yields string, not a separate field.
            "servings": self._parse_servings(self._safe_get(scraper, "yields", "")),
            "category": self._safe_get(scraper, "category", ""),
            "cuisine": self._safe_get(scraper, "cuisine", ""),
            "cooking_method": self._safe_get(scraper, "cooking_method", ""),
            "keywords": self._safe_get(scraper, "keywords", []),
            "dietary_restrictions": self._safe_get(scraper, "dietary_restrictions", []),
            "equipment": self._safe_get(scraper, "equipment", []),
            "nutrition": self._safe_get(scraper, "nutrients", {}),
            "rating": self._parse_rating(self._safe_get(scraper, "ratings")),
            "rating_count": self._parse_rating_count(self._safe_get(scraper, "ratings_count")),
            "language": self._safe_get(scraper, "language", ""),
            "links": self._safe_get(scraper, "links", []),
        }

        if not data["title"]:
            raise ParseError("Recipe has no title")

        # Sanitize all text fields to strip HTML (defense-in-depth against stored XSS)
        sanitize_recipe_data(data)

        return data

318 

319 def _safe_get(self, scraper, attr: str, default=None): 

320 """Safely get an attribute from the scraper.""" 

321 try: 

322 method = getattr(scraper, attr, None) 

323 if callable(method): 

324 result = method() 

325 return result if result is not None else default 

326 return default 

327 except Exception: 

328 logger.debug("Failed to get %s from scraper", attr, exc_info=True) 

329 return default 

330 

331 def _safe_get_ingredient_groups(self, scraper) -> list: 

332 """Get ingredient groups if available.""" 

333 try: 

334 groups = scraper.ingredient_groups() 

335 if groups: 

336 return [ 

337 {"purpose": getattr(g, "purpose", ""), "ingredients": getattr(g, "ingredients", [])} for g in groups 

338 ] 

339 except Exception: 

340 logger.warning("Failed to get ingredient groups from scraper", exc_info=True) 

341 return [] 

342 

343 def _parse_time(self, value) -> int | None: 

344 """Parse time value to minutes.""" 

345 if value is None: 

346 return None 

347 if isinstance(value, (int, float)): 

348 return int(value) 

349 if isinstance(value, str): 

350 # Try to extract number 

351 match = re.search(r"(\d+)", value) 

352 if match: 

353 return int(match.group(1)) 

354 return None 

355 

356 def _parse_servings(self, yields: str) -> int | None: 

357 """Extract serving count from yields string.""" 

358 if not yields: 

359 return None 

360 match = re.search(r"(\d+)", yields) 

361 if match: 

362 return int(match.group(1)) 

363 return None 

364 

365 def _parse_rating(self, value) -> float | None: 

366 """Parse rating value to float.""" 

367 if value is None: 

368 return None 

369 try: 

370 return float(value) 

371 except (ValueError, TypeError): 

372 return None 

373 

374 def _parse_rating_count(self, value) -> int | None: 

375 """Parse rating count to int.""" 

376 if value is None: 

377 return None 

378 try: 

379 return int(value) 

380 except (ValueError, TypeError): 

381 return None 

382 

383 async def _download_image(self, image_url: str) -> ContentFile | None: 

384 """ 

385 Download recipe image and return as ContentFile. 

386 

387 Validates image URL against SSRF blocklist before fetching. 

388 Follows redirects manually with per-hop validation (max 5 hops). 

389 Enforces response size limit (50MB). 

390 WebP images are converted to JPEG for iOS 9 compatibility. 

391 """ 

392 if not image_url: 

393 return None 

394 

395 # Validate image URL for SSRF protection (FR-001) 

396 try: 

397 resolved = validate_url(image_url) 

398 except ValueError: 

399 logger.warning("Blocked image URL (SSRF): %s", image_url) 

400 return None 

401 

402 for profile in BROWSER_PROFILES: 

403 try: 

404 content = await self._fetch_image_with_redirects(image_url, profile, resolved.curl_resolve) 

405 if content is not None: 

406 content = self._convert_webp_to_jpeg(content) 

407 return ContentFile(content) 

408 except Exception as e: 

409 logger.warning( 

410 "Failed to download image %s with %s: %s", 

411 image_url, 

412 profile, 

413 e, 

414 ) 

415 continue 

416 

417 return None 

418 

    async def _fetch_image_with_redirects(self, url, profile, curl_resolve=None):
        """Fetch image following redirects with per-hop SSRF validation and DNS pinning.

        Unlike _fetch_with_redirects, every failure mode here returns None
        instead of raising — a missing image should not fail the scrape.

        Args:
            url: Image URL (already validated by the caller).
            profile: curl_cffi browser-impersonation profile.
            curl_resolve: CURLOPT_RESOLVE-style pinning list for the first hop.

        Returns:
            Raw image bytes, or None if the image is missing, blocked,
            oversized, not an image, or redirects too many times.
        """
        from curl_cffi import CurlOpt

        current_url = url
        current_resolve = curl_resolve or []
        for _ in range(MAX_REDIRECT_HOPS):
            # Pin DNS for this hop only when a resolution is available.
            curl_opts = {CurlOpt.RESOLVE: current_resolve} if current_resolve else {}
            async with AsyncSession(impersonate=profile, curl_options=curl_opts) as session:
                response = await session.get(
                    current_url,
                    timeout=self.timeout,
                    # Redirects are followed manually so each hop can be
                    # re-validated below.
                    allow_redirects=False,
                )

                if response.status_code in (301, 302, 303, 307, 308):
                    location = response.headers.get("location")
                    if not location:
                        return None
                    try:
                        # Re-validate and re-pin DNS on every hop (TOCTOU defense).
                        resolved = validate_redirect_url(location)
                    except ValueError:
                        return None
                    current_url = location
                    current_resolve = resolved.curl_resolve
                    continue

                if response.status_code == 200:
                    # Accept only responses that declare an image content
                    # type, or whose URL path looks like an image file.
                    content_type = response.headers.get("content-type", "")
                    if "image" not in content_type and not self._is_image_url(current_url):
                        return None
                    # Fast reject via the Content-Length header first...
                    if not check_response_size(response, MAX_IMAGE_SIZE):
                        logger.warning("Image too large: %s", current_url)
                        return None
                    content = response.content
                    # ...then check the actual downloaded size.
                    if len(content) > MAX_IMAGE_SIZE:
                        logger.warning("Image content too large: %s", current_url)
                        return None
                    return content

                # Non-200, non-redirect status: no image.
                return None

        logger.warning("Too many redirects for image: %s", url)
        return None

463 

464 def _convert_webp_to_jpeg(self, content: bytes) -> bytes: 

465 """Convert WebP images to JPEG for iOS 9 compatibility. 

466 

467 Also resizes very large images to reduce file size. 

468 Rejects images that exceed the size limit (decompression bomb protection). 

469 """ 

470 if len(content) > MAX_IMAGE_SIZE: 

471 logger.warning("Image content too large for processing: %d bytes", len(content)) 

472 return content 

473 

474 try: 

475 img = Image.open(BytesIO(content)) 

476 

477 # Check if conversion is needed (WebP or very large) 

478 needs_conversion = img.format == "WEBP" 

479 needs_resize = img.width > 1200 or img.height > 1200 

480 

481 if not needs_conversion and not needs_resize: 

482 return content 

483 

484 # Resize if too large (max 1200px on longest side) 

485 if needs_resize: 

486 img.thumbnail((1200, 1200), Image.Resampling.LANCZOS) 

487 

488 # Convert to RGB if needed (for JPEG) 

489 if img.mode in ("RGBA", "P"): 

490 img = img.convert("RGB") 

491 

492 # Save as JPEG 

493 output = BytesIO() 

494 img.save(output, format="JPEG", quality=85, optimize=True) 

495 logger.info(f"Converted image: {img.format} -> JPEG, resized: {needs_resize}") 

496 return output.getvalue() 

497 

498 except Exception as e: 

499 logger.warning(f"Image conversion failed: {e}, using original") 

500 return content 

501 

502 def _is_image_url(self, url: str) -> bool: 

503 """Check if URL looks like an image.""" 

504 image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".webp") 

505 parsed = urlparse(url) 

506 return parsed.path.lower().endswith(image_extensions) 

507 

508 def _generate_image_filename(self, recipe_url: str, image_url: str) -> str: 

509 """Generate a unique filename for the recipe image. 

510 

511 Always uses .jpg extension since images are converted to JPEG 

512 for iOS 9 compatibility. 

513 """ 

514 # Create hash from URLs for uniqueness 

515 url_hash = hashlib.md5(f"{recipe_url}{image_url}".encode(), usedforsecurity=False).hexdigest()[:12] 

516 

517 return f"recipe_{url_hash}.jpg" 

← Back to Dashboard