Coverage for apps / recipes / services / scraper.py: 76%
253 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-12 10:49 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-12 10:49 +0000
1"""
2Recipe scraper service using curl_cffi and recipe-scrapers.
3"""
5import hashlib
6import logging
7import re
8import threading
9from io import BytesIO
10from urllib.parse import urlparse
12from apps.recipes.services.sanitizer import sanitize_recipe_data
14from PIL import Image
15from asgiref.sync import sync_to_async
16from django.core.files.base import ContentFile
17from django.utils import timezone
18from curl_cffi.requests import AsyncSession
19from recipe_scrapers import scrape_html
21from apps.core.validators import (
22 MAX_HTML_SIZE,
23 MAX_IMAGE_SIZE,
24 MAX_REDIRECT_HOPS,
25 check_content_size,
26 check_response_size,
27 validate_url,
28 validate_redirect_url,
29)
30from apps.recipes.services.fingerprint import BROWSER_PROFILES
# Limit decompression bomb attacks via PIL: cap how many pixels PIL will
# decode for a single image (it warns/refuses beyond this threshold).
Image.MAX_IMAGE_PIXELS = 178_956_970  # ~180 megapixels

# Module-level logger, named after this module per project convention.
logger = logging.getLogger(__name__)
class ScraperError(Exception):
    """Common base class for every failure raised by the recipe scraper."""
class FetchError(ScraperError):
    """Raised when the target URL cannot be fetched (network, SSRF, size)."""
class ParseError(ScraperError):
    """Raised when fetched HTML does not contain a parseable recipe."""
class RecipeScraper:
    """
    Async recipe scraper with browser fingerprint impersonation.

    Pages are fetched with curl_cffi (impersonating real browsers to get
    past anti-bot measures) and parsed into structured recipe data with
    recipe-scrapers.

    Browser profiles are centralized in fingerprint.py for maintainability.
    """

    # Per-request timeout in seconds.
    DEFAULT_TIMEOUT = 30

    def __init__(self):
        # Copied onto the instance so individual scrapers could be tuned.
        self.timeout = self.DEFAULT_TIMEOUT
    async def scrape_url(self, url: str, profile: "Profile") -> "Recipe":
        """
        Scrape a recipe from a URL and save it to the database.

        Pipeline: SSRF-validate the URL (pinning DNS), fetch the HTML,
        parse structured recipe data, attach an image (reusing the cached
        search image when one exists), save the Recipe, then fire off
        background AI-tip generation.

        Args:
            url: The recipe URL to scrape
            profile: The profile that will own this recipe

        Returns:
            Recipe model instance (already saved; image attached if found)

        Raises:
            FetchError: If the URL cannot be fetched or fails SSRF validation
            ParseError: If the HTML cannot be parsed as a recipe
        """
        # Import here to avoid circular imports
        from apps.recipes.models import Recipe

        # Validate URL for SSRF protection (returns pinned DNS resolution)
        try:
            resolved = validate_url(url)
        except ValueError as e:
            raise FetchError(str(e))

        # Fetch HTML using pinned DNS to prevent TOCTOU rebinding
        html = await self._fetch_html(url, resolved.curl_resolve)

        # Parse recipe data
        data = self._parse_recipe(html, url)

        # Check for cached search image first, then download if needed
        image_file = None
        if data.get("image_url"):
            # Try to reuse cached image from search results
            from apps.recipes.models import CachedSearchImage

            try:
                cached = await sync_to_async(CachedSearchImage.objects.get)(
                    external_url=data["image_url"], status=CachedSearchImage.STATUS_SUCCESS
                )

                if cached.image:
                    # Reuse cached image file (avoids a second download)
                    with cached.image.open("rb") as f:
                        image_file = ContentFile(f.read())

                    # Update access time to prevent cleanup
                    cached.last_accessed_at = timezone.now()
                    await sync_to_async(cached.save)(update_fields=["last_accessed_at"])

                    logger.info(f"Reused cached image for {data['image_url']}")

            except CachedSearchImage.DoesNotExist:
                pass

            # If no cache, download as normal
            if not image_file:
                image_file = await self._download_image(data["image_url"])

        # Create recipe record (in memory only at this point)
        recipe = Recipe(
            profile=profile,
            source_url=url,
            canonical_url=data.get("canonical_url", ""),
            host=data["host"],
            site_name=data.get("site_name", ""),
            title=data["title"],
            author=data.get("author", ""),
            description=data.get("description", ""),
            image_url=data.get("image_url", ""),
            ingredients=data.get("ingredients", []),
            ingredient_groups=data.get("ingredient_groups", []),
            instructions=data.get("instructions", []),
            instructions_text=data.get("instructions_text", ""),
            prep_time=data.get("prep_time"),
            cook_time=data.get("cook_time"),
            total_time=data.get("total_time"),
            yields=data.get("yields", ""),
            servings=data.get("servings"),
            category=data.get("category", ""),
            cuisine=data.get("cuisine", ""),
            cooking_method=data.get("cooking_method", ""),
            keywords=data.get("keywords", []),
            dietary_restrictions=data.get("dietary_restrictions", []),
            equipment=data.get("equipment", []),
            nutrition=data.get("nutrition", {}),
            rating=data.get("rating"),
            rating_count=data.get("rating_count"),
            language=data.get("language", ""),
            links=data.get("links", []),
        )

        # Save first to get an ID for the image path
        await sync_to_async(recipe.save)()

        # Attach image if downloaded
        if image_file:
            filename = self._generate_image_filename(url, data.get("image_url", ""))
            await sync_to_async(recipe.image.save)(filename, image_file, save=True)

        # Fire-and-forget: Generate AI tips in background thread (non-blocking)
        thread = threading.Thread(target=self._generate_tips_background, args=(recipe.id,), daemon=True)
        thread.start()

        return recipe
177 def _generate_tips_background(self, recipe_id: int):
178 """Generate AI tips for a recipe in background thread."""
179 try:
180 import django
182 django.setup() # Ensure Django is configured in thread
184 from apps.core.models import AppSettings
185 from apps.ai.services.tips import generate_tips
187 # Check if AI is available
188 settings_obj = AppSettings.get()
189 if not settings_obj.openrouter_api_key:
190 logger.debug(f"Skipping tips generation for recipe {recipe_id}: No API key")
191 return
193 # Generate tips
194 generate_tips(recipe_id)
195 logger.info(f"Auto-generated tips for recipe {recipe_id}")
197 except Exception as e:
198 # Log but don't fail - tips generation is optional
199 logger.warning(f"Failed to auto-generate tips for recipe {recipe_id}: {e}")
201 async def _fetch_html(self, url: str, curl_resolve: list[str] | None = None) -> str:
202 """
203 Fetch HTML from URL with browser impersonation.
205 Follows redirects manually with per-hop SSRF validation (max 5 hops).
206 Enforces response size limit (10MB).
207 Tries multiple browser profiles if initial request fails.
209 Args:
210 url: URL to fetch
211 curl_resolve: DNS pinning list from validate_url to prevent TOCTOU rebinding
212 """
213 errors = []
215 for profile in BROWSER_PROFILES:
216 try:
217 html = await self._fetch_with_redirects(url, profile, MAX_HTML_SIZE, curl_resolve)
218 if html is not None:
219 return html
220 errors.append(f"{profile}: empty response")
221 except FetchError:
222 raise
223 except ValueError as e:
224 raise FetchError(str(e))
225 except Exception as e:
226 errors.append(f"{profile}: {str(e)}")
227 continue
229 raise FetchError(f"Failed to fetch {url}: {'; '.join(errors)}")
    async def _fetch_with_redirects(self, url, profile, max_size, curl_resolve=None):
        """Fetch URL following redirects with per-hop SSRF validation and DNS pinning.

        Args:
            url: Starting URL.
            profile: curl_cffi browser-impersonation profile.
            max_size: Maximum allowed response size in bytes.
            curl_resolve: Pinned DNS entries from validate_url for the first hop.

        Returns:
            The page text on HTTP 200, or None for any other non-redirect
            status (the caller then tries the next browser profile).

        Raises:
            FetchError: Redirect without a Location header, oversized
                response, or too many redirect hops.
            ValueError: Propagated from validate_redirect_url when a redirect
                target fails SSRF validation (caller converts to FetchError).
        """
        from curl_cffi import CurlOpt

        current_url = url
        current_resolve = curl_resolve or []
        for _ in range(MAX_REDIRECT_HOPS):
            # Pin DNS for this hop so the host cannot be re-resolved to a
            # different (internal) address between validation and fetch.
            curl_opts = {CurlOpt.RESOLVE: current_resolve} if current_resolve else {}
            async with AsyncSession(impersonate=profile, curl_options=curl_opts) as session:
                response = await session.get(
                    current_url,
                    timeout=self.timeout,
                    allow_redirects=False,
                )

                if response.status_code in (301, 302, 303, 307, 308):
                    location = response.headers.get("location")
                    if not location:
                        raise FetchError("Redirect without Location header")
                    # Every redirect target is re-validated against the SSRF
                    # blocklist and gets its own pinned resolution.
                    resolved = validate_redirect_url(location)
                    current_url = location
                    current_resolve = resolved.curl_resolve
                    continue

                if response.status_code == 200:
                    # Reject by declared Content-Length first (cheap)...
                    if not check_response_size(response, max_size):
                        raise FetchError(f"Response too large (Content-Length > {max_size})")
                    content = response.text
                    # ...then by actual encoded size, since headers can lie.
                    check_content_size(content.encode("utf-8", errors="replace"), max_size)
                    return content

                # Any other status: give up on this profile.
                return None

        raise FetchError(f"Too many redirects (>{MAX_REDIRECT_HOPS})")
266 def _parse_recipe(self, html: str, url: str) -> dict:
267 """
268 Parse recipe data from HTML using recipe-scrapers.
269 """
270 try:
271 # supported_only=False allows scraping from any domain using schema.org
272 scraper = scrape_html(html, org_url=url, supported_only=False)
273 except Exception as e:
274 raise ParseError(f"Failed to parse recipe: {str(e)}")
276 # Extract host from URL
277 parsed_url = urlparse(url)
278 host = parsed_url.netloc.replace("www.", "")
280 # Build recipe data dict with safe attribute access
281 data = {
282 "host": host,
283 "title": self._safe_get(scraper, "title", ""),
284 "canonical_url": self._safe_get(scraper, "canonical_url", ""),
285 "site_name": self._safe_get(scraper, "site_name", ""),
286 "author": self._safe_get(scraper, "author", ""),
287 "description": self._safe_get(scraper, "description", ""),
288 "image_url": self._safe_get(scraper, "image", ""),
289 "ingredients": self._safe_get(scraper, "ingredients", []),
290 "ingredient_groups": self._safe_get_ingredient_groups(scraper),
291 "instructions": self._safe_get(scraper, "instructions_list", []),
292 "instructions_text": self._safe_get(scraper, "instructions", ""),
293 "prep_time": self._parse_time(self._safe_get(scraper, "prep_time")),
294 "cook_time": self._parse_time(self._safe_get(scraper, "cook_time")),
295 "total_time": self._parse_time(self._safe_get(scraper, "total_time")),
296 "yields": self._safe_get(scraper, "yields", ""),
297 "servings": self._parse_servings(self._safe_get(scraper, "yields", "")),
298 "category": self._safe_get(scraper, "category", ""),
299 "cuisine": self._safe_get(scraper, "cuisine", ""),
300 "cooking_method": self._safe_get(scraper, "cooking_method", ""),
301 "keywords": self._safe_get(scraper, "keywords", []),
302 "dietary_restrictions": self._safe_get(scraper, "dietary_restrictions", []),
303 "equipment": self._safe_get(scraper, "equipment", []),
304 "nutrition": self._safe_get(scraper, "nutrients", {}),
305 "rating": self._parse_rating(self._safe_get(scraper, "ratings")),
306 "rating_count": self._parse_rating_count(self._safe_get(scraper, "ratings_count")),
307 "language": self._safe_get(scraper, "language", ""),
308 "links": self._safe_get(scraper, "links", []),
309 }
311 if not data["title"]:
312 raise ParseError("Recipe has no title")
314 # Sanitize all text fields to strip HTML (defense-in-depth against stored XSS)
315 sanitize_recipe_data(data)
317 return data
319 def _safe_get(self, scraper, attr: str, default=None):
320 """Safely get an attribute from the scraper."""
321 try:
322 method = getattr(scraper, attr, None)
323 if callable(method):
324 result = method()
325 return result if result is not None else default
326 return default
327 except Exception:
328 logger.debug("Failed to get %s from scraper", attr, exc_info=True)
329 return default
331 def _safe_get_ingredient_groups(self, scraper) -> list:
332 """Get ingredient groups if available."""
333 try:
334 groups = scraper.ingredient_groups()
335 if groups:
336 return [
337 {"purpose": getattr(g, "purpose", ""), "ingredients": getattr(g, "ingredients", [])} for g in groups
338 ]
339 except Exception:
340 logger.warning("Failed to get ingredient groups from scraper", exc_info=True)
341 return []
343 def _parse_time(self, value) -> int | None:
344 """Parse time value to minutes."""
345 if value is None:
346 return None
347 if isinstance(value, (int, float)):
348 return int(value)
349 if isinstance(value, str):
350 # Try to extract number
351 match = re.search(r"(\d+)", value)
352 if match:
353 return int(match.group(1))
354 return None
356 def _parse_servings(self, yields: str) -> int | None:
357 """Extract serving count from yields string."""
358 if not yields:
359 return None
360 match = re.search(r"(\d+)", yields)
361 if match:
362 return int(match.group(1))
363 return None
365 def _parse_rating(self, value) -> float | None:
366 """Parse rating value to float."""
367 if value is None:
368 return None
369 try:
370 return float(value)
371 except (ValueError, TypeError):
372 return None
374 def _parse_rating_count(self, value) -> int | None:
375 """Parse rating count to int."""
376 if value is None:
377 return None
378 try:
379 return int(value)
380 except (ValueError, TypeError):
381 return None
    async def _download_image(self, image_url: str) -> ContentFile | None:
        """
        Download recipe image and return as ContentFile.

        Validates image URL against SSRF blocklist before fetching.
        Follows redirects manually with per-hop validation (max 5 hops).
        Enforces response size limit (50MB).
        WebP images are converted to JPEG for iOS 9 compatibility.

        Returns None on any failure — the image is optional, so errors are
        logged rather than raised.
        """
        if not image_url:
            return None

        # Validate image URL for SSRF protection (FR-001); the pinned DNS
        # resolution is reused below to prevent TOCTOU rebinding.
        try:
            resolved = validate_url(image_url)
        except ValueError:
            logger.warning("Blocked image URL (SSRF): %s", image_url)
            return None

        # Try each browser fingerprint until one succeeds.
        for profile in BROWSER_PROFILES:
            try:
                content = await self._fetch_image_with_redirects(image_url, profile, resolved.curl_resolve)
                if content is not None:
                    # Normalize the format (WebP -> JPEG, shrink oversized images).
                    content = self._convert_webp_to_jpeg(content)
                    return ContentFile(content)
            except Exception as e:
                logger.warning(
                    "Failed to download image %s with %s: %s",
                    image_url,
                    profile,
                    e,
                )
                continue

        return None
    async def _fetch_image_with_redirects(self, url, profile, curl_resolve=None):
        """Fetch image following redirects with per-hop SSRF validation and DNS pinning.

        Unlike the HTML fetcher, failures return None instead of raising:
        a missing image must never abort a scrape.

        Args:
            url: Image URL to fetch.
            profile: curl_cffi browser-impersonation profile.
            curl_resolve: Pinned DNS entries from validate_url for the first hop.

        Returns:
            Image bytes on success, or None on any problem (bad redirect,
            blocked target, non-image content, oversized, non-200 status).
        """
        from curl_cffi import CurlOpt

        current_url = url
        current_resolve = curl_resolve or []
        for _ in range(MAX_REDIRECT_HOPS):
            # Pin DNS for this hop to prevent TOCTOU rebinding.
            curl_opts = {CurlOpt.RESOLVE: current_resolve} if current_resolve else {}
            async with AsyncSession(impersonate=profile, curl_options=curl_opts) as session:
                response = await session.get(
                    current_url,
                    timeout=self.timeout,
                    allow_redirects=False,
                )

                if response.status_code in (301, 302, 303, 307, 308):
                    location = response.headers.get("location")
                    if not location:
                        return None
                    # Re-validate every redirect target against the SSRF blocklist.
                    try:
                        resolved = validate_redirect_url(location)
                    except ValueError:
                        return None
                    current_url = location
                    current_resolve = resolved.curl_resolve
                    continue

                if response.status_code == 200:
                    # Require an image content-type, or at least an image-looking URL.
                    content_type = response.headers.get("content-type", "")
                    if "image" not in content_type and not self._is_image_url(current_url):
                        return None
                    # Check the declared size first (cheap)...
                    if not check_response_size(response, MAX_IMAGE_SIZE):
                        logger.warning("Image too large: %s", current_url)
                        return None
                    content = response.content
                    # ...then the actual payload, since headers can lie.
                    if len(content) > MAX_IMAGE_SIZE:
                        logger.warning("Image content too large: %s", current_url)
                        return None
                    return content

                # Any other status: give up on this profile.
                return None

        logger.warning("Too many redirects for image: %s", url)
        return None
464 def _convert_webp_to_jpeg(self, content: bytes) -> bytes:
465 """Convert WebP images to JPEG for iOS 9 compatibility.
467 Also resizes very large images to reduce file size.
468 Rejects images that exceed the size limit (decompression bomb protection).
469 """
470 if len(content) > MAX_IMAGE_SIZE:
471 logger.warning("Image content too large for processing: %d bytes", len(content))
472 return content
474 try:
475 img = Image.open(BytesIO(content))
477 # Check if conversion is needed (WebP or very large)
478 needs_conversion = img.format == "WEBP"
479 needs_resize = img.width > 1200 or img.height > 1200
481 if not needs_conversion and not needs_resize:
482 return content
484 # Resize if too large (max 1200px on longest side)
485 if needs_resize:
486 img.thumbnail((1200, 1200), Image.Resampling.LANCZOS)
488 # Convert to RGB if needed (for JPEG)
489 if img.mode in ("RGBA", "P"):
490 img = img.convert("RGB")
492 # Save as JPEG
493 output = BytesIO()
494 img.save(output, format="JPEG", quality=85, optimize=True)
495 logger.info(f"Converted image: {img.format} -> JPEG, resized: {needs_resize}")
496 return output.getvalue()
498 except Exception as e:
499 logger.warning(f"Image conversion failed: {e}, using original")
500 return content
502 def _is_image_url(self, url: str) -> bool:
503 """Check if URL looks like an image."""
504 image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".webp")
505 parsed = urlparse(url)
506 return parsed.path.lower().endswith(image_extensions)
508 def _generate_image_filename(self, recipe_url: str, image_url: str) -> str:
509 """Generate a unique filename for the recipe image.
511 Always uses .jpg extension since images are converted to JPEG
512 for iOS 9 compatibility.
513 """
514 # Create hash from URLs for uniqueness
515 url_hash = hashlib.md5(f"{recipe_url}{image_url}".encode(), usedforsecurity=False).hexdigest()[:12]
517 return f"recipe_{url_hash}.jpg"