Coverage for apps / recipes / services / scraper.py: 75%
257 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 13:22 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 13:22 +0000
1"""
2Recipe scraper service using curl_cffi and recipe-scrapers.
3"""
5import hashlib
6import logging
7import re
8import threading
9from io import BytesIO
10from urllib.parse import urlparse
12from apps.recipes.services.sanitizer import sanitize_recipe_data
14from PIL import Image
15from asgiref.sync import sync_to_async
16from django.core.files.base import ContentFile
17from django.utils import timezone
18from curl_cffi.requests import AsyncSession
19from recipe_scrapers import scrape_html
21from apps.core.validators import (
22 MAX_HTML_SIZE,
23 MAX_IMAGE_SIZE,
24 MAX_REDIRECT_HOPS,
25 check_content_size,
26 check_response_size,
27 validate_url,
28 validate_redirect_url,
29)
30from apps.recipes.services.fingerprint import BROWSER_PROFILES
# Limit decompression bomb attacks via PIL
# NOTE(review): per the Pillow documentation, exceeding MAX_IMAGE_PIXELS only
# emits a DecompressionBombWarning; a DecompressionBombError is raised at twice
# this value. Confirm that this threshold is the intended one.
Image.MAX_IMAGE_PIXELS = 178_956_970  # ~180 megapixels

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class ScraperError(Exception):
    """Base class for the errors raised by the recipe scraper service."""
class FetchError(ScraperError):
    """Scraper error signalling that a URL could not be fetched."""
class ParseError(ScraperError):
    """Scraper error signalling that fetched HTML could not be parsed as a recipe."""
50class RecipeScraper:
51 """Async recipe scraper with browser fingerprint impersonation."""
53 DEFAULT_TIMEOUT = 30
55 def __init__(self):
56 self.timeout = self.DEFAULT_TIMEOUT
58 async def scrape_url(self, url: str, profile: "Profile") -> "Recipe":
59 """
60 Scrape a recipe from a URL and save it to the database.
62 Args:
63 url: The recipe URL to scrape
64 profile: The profile that will own this recipe
66 Returns:
67 Recipe model instance
69 Raises:
70 FetchError: If the URL cannot be fetched
71 ParseError: If the HTML cannot be parsed as a recipe
72 """
73 # Import here to avoid circular imports
74 from apps.recipes.models import Recipe
76 # Validate URL for SSRF protection (returns pinned DNS resolution)
77 try:
78 resolved = validate_url(url)
79 except ValueError as e:
80 raise FetchError(str(e))
82 # Fetch HTML using pinned DNS to prevent TOCTOU rebinding
83 html = await self._fetch_html(url, resolved.curl_resolve)
85 # Parse recipe data
86 data = self._parse_recipe(html, url)
88 # Check for cached search image first, then download if needed
89 image_file = None
90 if data.get("image_url"):
91 # Try to reuse cached image from search results
92 from apps.recipes.models import CachedSearchImage
94 try:
95 cached = await sync_to_async(CachedSearchImage.objects.get)(
96 external_url=data["image_url"], status=CachedSearchImage.STATUS_SUCCESS
97 )
99 if cached.image:
100 # Reuse cached image file
101 with cached.image.open("rb") as f:
102 image_file = ContentFile(f.read())
104 # Update access time to prevent cleanup
105 cached.last_accessed_at = timezone.now()
106 await sync_to_async(cached.save)(update_fields=["last_accessed_at"])
108 logger.info(f"Reused cached image for {data['image_url']}")
110 except CachedSearchImage.DoesNotExist:
111 pass
113 # If no cache, download as normal
114 if not image_file:
115 image_file = await self._download_image(data["image_url"])
117 # Create recipe record
118 recipe = Recipe(
119 profile=profile,
120 source_url=url,
121 canonical_url=data.get("canonical_url", ""),
122 host=data["host"],
123 site_name=data.get("site_name", ""),
124 title=data["title"],
125 author=data.get("author", ""),
126 description=data.get("description", ""),
127 image_url=data.get("image_url", ""),
128 ingredients=data.get("ingredients", []),
129 ingredient_groups=data.get("ingredient_groups", []),
130 instructions=data.get("instructions", []),
131 instructions_text=data.get("instructions_text", ""),
132 prep_time=data.get("prep_time"),
133 cook_time=data.get("cook_time"),
134 total_time=data.get("total_time"),
135 yields=data.get("yields", ""),
136 servings=data.get("servings"),
137 category=data.get("category", ""),
138 cuisine=data.get("cuisine", ""),
139 cooking_method=data.get("cooking_method", ""),
140 keywords=data.get("keywords", []),
141 dietary_restrictions=data.get("dietary_restrictions", []),
142 equipment=data.get("equipment", []),
143 nutrition=data.get("nutrition", {}),
144 rating=data.get("rating"),
145 rating_count=data.get("rating_count"),
146 language=data.get("language", ""),
147 links=data.get("links", []),
148 )
150 # Save first to get an ID for the image path
151 await sync_to_async(recipe.save)()
153 # Attach image if downloaded
154 if image_file:
155 filename = self._generate_image_filename(url, data.get("image_url", ""))
156 await sync_to_async(recipe.image.save)(filename, image_file, save=True)
158 # Fire-and-forget: Generate AI tips in background thread (non-blocking)
159 thread = threading.Thread(target=self._generate_tips_background, args=(recipe.id,), daemon=True)
160 thread.start()
162 return recipe
164 def _generate_tips_background(self, recipe_id: int):
165 """Generate AI tips for a recipe in background thread."""
166 try:
167 import django
169 django.setup() # Ensure Django is configured in thread
171 from apps.core.models import AppSettings
172 from apps.ai.services.tips import generate_tips
174 # Check if AI is available
175 settings_obj = AppSettings.get()
176 if not settings_obj.openrouter_api_key:
177 logger.debug(f"Skipping tips generation for recipe {recipe_id}: No API key")
178 return
180 # Generate tips
181 generate_tips(recipe_id)
182 logger.info(f"Auto-generated tips for recipe {recipe_id}")
184 except Exception as e:
185 # Log but don't fail - tips generation is optional
186 logger.warning(f"Failed to auto-generate tips for recipe {recipe_id}: {e}")
188 async def _fetch_html(self, url: str, curl_resolve: list[str] | None = None) -> str:
189 """
190 Fetch HTML from URL with browser impersonation.
192 Follows redirects manually with per-hop SSRF validation (max 5 hops).
193 Enforces response size limit (10MB).
194 Tries multiple browser profiles if initial request fails.
196 Args:
197 url: URL to fetch
198 curl_resolve: DNS pinning list from validate_url to prevent TOCTOU rebinding
199 """
200 errors = []
202 for profile in BROWSER_PROFILES:
203 try:
204 html = await self._fetch_with_redirects(url, profile, MAX_HTML_SIZE, curl_resolve)
205 if html is not None:
206 return html
207 errors.append(f"{profile}: empty response")
208 except FetchError:
209 raise
210 except ValueError as e:
211 raise FetchError(str(e))
212 except Exception as e:
213 errors.append(f"{profile}: {str(e)}")
214 continue
216 raise FetchError(f"Failed to fetch {url}: {'; '.join(errors)}")
218 async def _fetch_with_redirects(self, url, profile, max_size, curl_resolve=None):
219 """Fetch URL following redirects with per-hop SSRF validation and DNS pinning."""
220 from curl_cffi import CurlOpt
222 current_url = url
223 current_resolve = curl_resolve or []
224 for _ in range(MAX_REDIRECT_HOPS):
225 curl_opts = {CurlOpt.RESOLVE: current_resolve} if current_resolve else {}
226 async with AsyncSession(impersonate=profile, curl_options=curl_opts) as session:
227 response = await session.get(
228 current_url,
229 timeout=self.timeout,
230 allow_redirects=False,
231 )
233 if response.status_code in (301, 302, 303, 307, 308):
234 location = response.headers.get("location")
235 if not location:
236 raise FetchError("Redirect without Location header")
237 resolved = validate_redirect_url(location)
238 current_url = location
239 current_resolve = resolved.curl_resolve
240 continue
242 if response.status_code == 200:
243 if not check_response_size(response, max_size):
244 raise FetchError(f"Response too large (Content-Length > {max_size})")
245 content = response.text
246 check_content_size(content.encode("utf-8", errors="replace"), max_size)
247 return content
249 if response.status_code == 404:
250 raise FetchError("Recipe page not found at that URL")
251 if response.status_code == 429:
252 raise FetchError("Recipe site is temporarily rate-limiting — try again shortly")
253 return None # 403/5xx: let other browser profiles try
255 raise FetchError(f"Too many redirects (>{MAX_REDIRECT_HOPS})")
257 def _parse_recipe(self, html: str, url: str) -> dict:
258 """
259 Parse recipe data from HTML using recipe-scrapers.
260 """
261 try:
262 # supported_only=False allows scraping from any domain using schema.org
263 scraper = scrape_html(html, org_url=url, supported_only=False)
264 except Exception as e:
265 raise ParseError(f"Failed to parse recipe: {str(e)}")
267 # Extract host from URL
268 parsed_url = urlparse(url)
269 host = parsed_url.netloc.replace("www.", "")
271 # Build recipe data dict with safe attribute access
272 data = {
273 "host": host,
274 "title": self._safe_get(scraper, "title", ""),
275 "canonical_url": self._safe_get(scraper, "canonical_url", ""),
276 "site_name": self._safe_get(scraper, "site_name", ""),
277 "author": self._safe_get(scraper, "author", ""),
278 "description": self._safe_get(scraper, "description", ""),
279 "image_url": self._safe_get(scraper, "image", ""),
280 "ingredients": self._safe_get(scraper, "ingredients", []),
281 "ingredient_groups": self._safe_get_ingredient_groups(scraper),
282 "instructions": self._safe_get(scraper, "instructions_list", []),
283 "instructions_text": self._safe_get(scraper, "instructions", ""),
284 "prep_time": self._parse_time(self._safe_get(scraper, "prep_time")),
285 "cook_time": self._parse_time(self._safe_get(scraper, "cook_time")),
286 "total_time": self._parse_time(self._safe_get(scraper, "total_time")),
287 "yields": self._safe_get(scraper, "yields", ""),
288 "servings": self._parse_servings(self._safe_get(scraper, "yields", "")),
289 "category": self._safe_get(scraper, "category", ""),
290 "cuisine": self._safe_get(scraper, "cuisine", ""),
291 "cooking_method": self._safe_get(scraper, "cooking_method", ""),
292 "keywords": self._safe_get(scraper, "keywords", []),
293 "dietary_restrictions": self._safe_get(scraper, "dietary_restrictions", []),
294 "equipment": self._safe_get(scraper, "equipment", []),
295 "nutrition": self._safe_get(scraper, "nutrients", {}),
296 "rating": self._parse_rating(self._safe_get(scraper, "ratings")),
297 "rating_count": self._parse_rating_count(self._safe_get(scraper, "ratings_count")),
298 "language": self._safe_get(scraper, "language", ""),
299 "links": self._safe_get(scraper, "links", []),
300 }
302 if not data["title"]:
303 raise ParseError("Recipe has no title")
305 # Sanitize all text fields to strip HTML (defense-in-depth against stored XSS)
306 sanitize_recipe_data(data)
308 return data
310 def _safe_get(self, scraper, attr: str, default=None):
311 """Safely get an attribute from the scraper."""
312 try:
313 method = getattr(scraper, attr, None)
314 if callable(method):
315 result = method()
316 return result if result is not None else default
317 return default
318 except Exception:
319 logger.debug("Failed to get %s from scraper", attr, exc_info=True)
320 return default
322 def _safe_get_ingredient_groups(self, scraper) -> list:
323 """Get ingredient groups if available."""
324 try:
325 groups = scraper.ingredient_groups()
326 if groups:
327 return [
328 {"purpose": getattr(g, "purpose", ""), "ingredients": getattr(g, "ingredients", [])} for g in groups
329 ]
330 except Exception:
331 logger.warning("Failed to get ingredient groups from scraper", exc_info=True)
332 return []
334 def _parse_time(self, value) -> int | None:
335 """Parse time value to minutes."""
336 if value is None:
337 return None
338 if isinstance(value, (int, float)):
339 return int(value)
340 if isinstance(value, str):
341 # Try to extract number
342 match = re.search(r"(\d+)", value)
343 if match:
344 return int(match.group(1))
345 return None
347 def _parse_servings(self, yields: str) -> int | None:
348 """Extract serving count from yields string."""
349 if not yields:
350 return None
351 match = re.search(r"(\d+)", yields)
352 if match:
353 return int(match.group(1))
354 return None
356 def _parse_rating(self, value) -> float | None:
357 """Parse rating value to float."""
358 if value is None:
359 return None
360 try:
361 return float(value)
362 except (ValueError, TypeError):
363 return None
365 def _parse_rating_count(self, value) -> int | None:
366 """Parse rating count to int."""
367 if value is None:
368 return None
369 try:
370 return int(value)
371 except (ValueError, TypeError):
372 return None
374 async def _download_image(self, image_url: str) -> ContentFile | None:
375 """
376 Download recipe image and return as ContentFile.
378 Validates image URL against SSRF blocklist before fetching.
379 Follows redirects manually with per-hop validation (max 5 hops).
380 Enforces response size limit (50MB).
381 WebP images are converted to JPEG for iOS 9 compatibility.
382 """
383 if not image_url:
384 return None
386 # Validate image URL for SSRF protection (FR-001)
387 try:
388 resolved = validate_url(image_url)
389 except ValueError:
390 logger.warning("Blocked image URL (SSRF): %s", image_url)
391 return None
393 for profile in BROWSER_PROFILES:
394 try:
395 content = await self._fetch_image_with_redirects(image_url, profile, resolved.curl_resolve)
396 if content is not None:
397 content = self._convert_webp_to_jpeg(content)
398 return ContentFile(content)
399 except Exception as e:
400 logger.warning(
401 "Failed to download image %s with %s: %s",
402 image_url,
403 profile,
404 e,
405 )
406 continue
408 return None
410 async def _fetch_image_with_redirects(self, url, profile, curl_resolve=None):
411 """Fetch image following redirects with per-hop SSRF validation and DNS pinning."""
412 from curl_cffi import CurlOpt
414 current_url = url
415 current_resolve = curl_resolve or []
416 for _ in range(MAX_REDIRECT_HOPS):
417 curl_opts = {CurlOpt.RESOLVE: current_resolve} if current_resolve else {}
418 async with AsyncSession(impersonate=profile, curl_options=curl_opts) as session:
419 response = await session.get(
420 current_url,
421 timeout=self.timeout,
422 allow_redirects=False,
423 )
425 if response.status_code in (301, 302, 303, 307, 308):
426 location = response.headers.get("location")
427 if not location:
428 return None
429 try:
430 resolved = validate_redirect_url(location)
431 except ValueError:
432 return None
433 current_url = location
434 current_resolve = resolved.curl_resolve
435 continue
437 if response.status_code == 200:
438 content_type = response.headers.get("content-type", "")
439 if "image" not in content_type and not self._is_image_url(current_url):
440 return None
441 if not check_response_size(response, MAX_IMAGE_SIZE):
442 logger.warning("Image too large: %s", current_url)
443 return None
444 content = response.content
445 if len(content) > MAX_IMAGE_SIZE:
446 logger.warning("Image content too large: %s", current_url)
447 return None
448 return content
450 return None
452 logger.warning("Too many redirects for image: %s", url)
453 return None
455 def _convert_webp_to_jpeg(self, content: bytes) -> bytes:
456 """Convert WebP images to JPEG for iOS 9 compatibility.
458 Also resizes very large images to reduce file size.
459 Rejects images that exceed the size limit (decompression bomb protection).
460 """
461 if len(content) > MAX_IMAGE_SIZE:
462 logger.warning("Image content too large for processing: %d bytes", len(content))
463 return content
465 try:
466 img = Image.open(BytesIO(content))
468 # Check if conversion is needed (WebP or very large)
469 needs_conversion = img.format == "WEBP"
470 needs_resize = img.width > 1200 or img.height > 1200
472 if not needs_conversion and not needs_resize:
473 return content
475 # Resize if too large (max 1200px on longest side)
476 if needs_resize:
477 img.thumbnail((1200, 1200), Image.Resampling.LANCZOS)
479 # Convert to RGB if needed (for JPEG)
480 if img.mode in ("RGBA", "P"):
481 img = img.convert("RGB")
483 # Save as JPEG
484 output = BytesIO()
485 img.save(output, format="JPEG", quality=85, optimize=True)
486 logger.info(f"Converted image: {img.format} -> JPEG, resized: {needs_resize}")
487 return output.getvalue()
489 except Exception as e:
490 logger.warning(f"Image conversion failed: {e}, using original")
491 return content
493 def _is_image_url(self, url: str) -> bool:
494 """Check if URL looks like an image."""
495 image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".webp")
496 parsed = urlparse(url)
497 return parsed.path.lower().endswith(image_extensions)
499 def _generate_image_filename(self, recipe_url: str, image_url: str) -> str:
500 """Generate a unique filename for the recipe image.
502 Always uses .jpg extension since images are converted to JPEG
503 for iOS 9 compatibility.
504 """
505 # Create hash from URLs for uniqueness
506 url_hash = hashlib.md5(f"{recipe_url}{image_url}".encode(), usedforsecurity=False).hexdigest()[:12]
508 return f"recipe_{url_hash}.jpg"