Coverage for apps / recipes / services / scraper.py: 84%
181 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 19:13 +0000
1"""
2Recipe scraper service using curl_cffi and recipe-scrapers.
3"""
5import hashlib
6import logging
7import re
8import threading
9from io import BytesIO
10from pathlib import Path
11from urllib.parse import urlparse
13from PIL import Image
14from asgiref.sync import sync_to_async
15from django.conf import settings
16from django.core.files.base import ContentFile
17from django.utils import timezone
18from curl_cffi.requests import AsyncSession
19from recipe_scrapers import scrape_html
21from apps.recipes.services.fingerprint import BROWSER_PROFILES
23logger = logging.getLogger(__name__)
class ScraperError(Exception):
    """Common base class for all recipe-scraper exceptions."""
class FetchError(ScraperError):
    """Raised when the recipe URL could not be fetched."""
class ParseError(ScraperError):
    """Raised when fetched HTML could not be parsed into a recipe."""
44class RecipeScraper:
45 """
46 Async recipe scraper with browser fingerprint impersonation.
48 Uses curl_cffi to bypass anti-bot measures and recipe-scrapers
49 to parse structured recipe data from HTML.
51 Browser profiles are centralized in fingerprint.py for maintainability.
52 """
54 DEFAULT_TIMEOUT = 30
    def __init__(self):
        """Initialize the scraper with the default request timeout."""
        # Timeout (seconds) is stored per instance so it can be adjusted
        # after construction without affecting the class-level default.
        self.timeout: int = self.DEFAULT_TIMEOUT
    async def scrape_url(self, url: str, profile: "Profile") -> "Recipe":
        """
        Scrape a recipe from a URL and save it to the database.

        Side effects: saves a new Recipe row, may attach a downloaded or
        cached image file to it, and spawns a fire-and-forget daemon thread
        for AI tips generation.

        Args:
            url: The recipe URL to scrape
            profile: The profile that will own this recipe

        Returns:
            Recipe model instance

        Raises:
            FetchError: If the URL cannot be fetched
            ParseError: If the HTML cannot be parsed as a recipe
        """
        # Import here to avoid circular imports
        from apps.recipes.models import Recipe

        # Fetch HTML
        html = await self._fetch_html(url)

        # Parse recipe data
        data = self._parse_recipe(html, url)

        # Check for cached search image first, then download if needed
        image_file: ContentFile | None = None
        if data.get("image_url"):
            # Try to reuse cached image from search results
            from apps.recipes.models import CachedSearchImage

            try:
                cached = await sync_to_async(CachedSearchImage.objects.get)(
                    external_url=data["image_url"], status=CachedSearchImage.STATUS_SUCCESS
                )

                if cached.image:
                    # Reuse cached image file
                    with cached.image.open("rb") as f:
                        image_file = ContentFile(f.read())

                    # Update access time to prevent cleanup
                    cached.last_accessed_at = timezone.now()
                    await sync_to_async(cached.save)(update_fields=["last_accessed_at"])

                    logger.info(f"Reused cached image for {data['image_url']}")

            except CachedSearchImage.DoesNotExist:
                pass

            # If no cache, download as normal
            if not image_file:
                image_file = await self._download_image(data["image_url"])

        # Create recipe record. Missing optional fields fall back to the
        # empty/None defaults produced by _parse_recipe.
        recipe = Recipe(
            profile=profile,
            source_url=url,
            canonical_url=data.get("canonical_url", ""),
            host=data["host"],
            site_name=data.get("site_name", ""),
            title=data["title"],
            author=data.get("author", ""),
            description=data.get("description", ""),
            image_url=data.get("image_url", ""),
            ingredients=data.get("ingredients", []),
            ingredient_groups=data.get("ingredient_groups", []),
            instructions=data.get("instructions", []),
            instructions_text=data.get("instructions_text", ""),
            prep_time=data.get("prep_time"),
            cook_time=data.get("cook_time"),
            total_time=data.get("total_time"),
            yields=data.get("yields", ""),
            servings=data.get("servings"),
            category=data.get("category", ""),
            cuisine=data.get("cuisine", ""),
            cooking_method=data.get("cooking_method", ""),
            keywords=data.get("keywords", []),
            dietary_restrictions=data.get("dietary_restrictions", []),
            equipment=data.get("equipment", []),
            nutrition=data.get("nutrition", {}),
            rating=data.get("rating"),
            rating_count=data.get("rating_count"),
            language=data.get("language", ""),
            links=data.get("links", []),
        )

        # Save first to get an ID for the image path
        await sync_to_async(recipe.save)()

        # Attach image if downloaded
        if image_file:
            filename = self._generate_image_filename(url, data.get("image_url", ""))
            await sync_to_async(recipe.image.save)(filename, image_file, save=True)

        # Fire-and-forget: Generate AI tips in background thread (non-blocking)
        # NOTE: the daemon thread is never joined, so tips may be silently
        # lost if the process exits before it finishes.
        thread = threading.Thread(target=self._generate_tips_background, args=(recipe.id,), daemon=True)
        thread.start()

        return recipe
159 def _generate_tips_background(self, recipe_id: int):
160 """Generate AI tips for a recipe in background thread."""
161 try:
162 import django
164 django.setup() # Ensure Django is configured in thread
166 from apps.core.models import AppSettings
167 from apps.ai.services.tips import generate_tips
169 # Check if AI is available
170 settings_obj = AppSettings.get()
171 if not settings_obj.openrouter_api_key:
172 logger.debug(f"Skipping tips generation for recipe {recipe_id}: No API key")
173 return
175 # Generate tips
176 generate_tips(recipe_id)
177 logger.info(f"Auto-generated tips for recipe {recipe_id}")
179 except Exception as e:
180 # Log but don't fail - tips generation is optional
181 logger.warning(f"Failed to auto-generate tips for recipe {recipe_id}: {e}")
    async def _fetch_html(self, url: str) -> str:
        """
        Fetch HTML from URL with browser impersonation.

        Tries multiple browser profiles if initial request fails.
        Browser profiles are configured in fingerprint.py.

        Args:
            url: The page URL to fetch.

        Returns:
            The response body text of the first profile that yields HTTP 200.

        Raises:
            FetchError: If every profile fails, with all per-profile errors
                joined into the message.
        """
        errors = []

        for profile in BROWSER_PROFILES:
            try:
                async with AsyncSession(impersonate=profile) as session:
                    response = await session.get(
                        url,
                        timeout=self.timeout,
                        allow_redirects=True,
                    )

                    if response.status_code == 200:
                        return response.text

                    # Non-200 is treated like a failure: record it and let the
                    # loop try the next browser profile.
                    errors.append(f"{profile}: HTTP {response.status_code}")

            except Exception as e:
                # Network/TLS/timeout errors also just advance to the next profile.
                errors.append(f"{profile}: {str(e)}")
                continue

        raise FetchError(f"Failed to fetch {url}: {'; '.join(errors)}")
212 def _parse_recipe(self, html: str, url: str) -> dict:
213 """
214 Parse recipe data from HTML using recipe-scrapers.
215 """
216 try:
217 # supported_only=False allows scraping from any domain using schema.org
218 scraper = scrape_html(html, org_url=url, supported_only=False)
219 except Exception as e:
220 raise ParseError(f"Failed to parse recipe: {str(e)}")
222 # Extract host from URL
223 parsed_url = urlparse(url)
224 host = parsed_url.netloc.replace("www.", "")
226 # Build recipe data dict with safe attribute access
227 data = {
228 "host": host,
229 "title": self._safe_get(scraper, "title", ""),
230 "canonical_url": self._safe_get(scraper, "canonical_url", ""),
231 "site_name": self._safe_get(scraper, "site_name", ""),
232 "author": self._safe_get(scraper, "author", ""),
233 "description": self._safe_get(scraper, "description", ""),
234 "image_url": self._safe_get(scraper, "image", ""),
235 "ingredients": self._safe_get(scraper, "ingredients", []),
236 "ingredient_groups": self._safe_get_ingredient_groups(scraper),
237 "instructions": self._safe_get(scraper, "instructions_list", []),
238 "instructions_text": self._safe_get(scraper, "instructions", ""),
239 "prep_time": self._parse_time(self._safe_get(scraper, "prep_time")),
240 "cook_time": self._parse_time(self._safe_get(scraper, "cook_time")),
241 "total_time": self._parse_time(self._safe_get(scraper, "total_time")),
242 "yields": self._safe_get(scraper, "yields", ""),
243 "servings": self._parse_servings(self._safe_get(scraper, "yields", "")),
244 "category": self._safe_get(scraper, "category", ""),
245 "cuisine": self._safe_get(scraper, "cuisine", ""),
246 "cooking_method": self._safe_get(scraper, "cooking_method", ""),
247 "keywords": self._safe_get(scraper, "keywords", []),
248 "dietary_restrictions": self._safe_get(scraper, "dietary_restrictions", []),
249 "equipment": self._safe_get(scraper, "equipment", []),
250 "nutrition": self._safe_get(scraper, "nutrients", {}),
251 "rating": self._parse_rating(self._safe_get(scraper, "ratings")),
252 "rating_count": self._parse_rating_count(self._safe_get(scraper, "ratings_count")),
253 "language": self._safe_get(scraper, "language", ""),
254 "links": self._safe_get(scraper, "links", []),
255 }
257 if not data["title"]:
258 raise ParseError("Recipe has no title")
260 return data
262 def _safe_get(self, scraper, attr: str, default=None):
263 """Safely get an attribute from the scraper."""
264 try:
265 method = getattr(scraper, attr, None)
266 if callable(method):
267 result = method()
268 return result if result is not None else default
269 return default
270 except Exception:
271 return default
273 def _safe_get_ingredient_groups(self, scraper) -> list:
274 """Get ingredient groups if available."""
275 try:
276 groups = scraper.ingredient_groups()
277 if groups:
278 return [
279 {"purpose": getattr(g, "purpose", ""), "ingredients": getattr(g, "ingredients", [])} for g in groups
280 ]
281 except Exception:
282 pass
283 return []
285 def _parse_time(self, value) -> int | None:
286 """Parse time value to minutes."""
287 if value is None:
288 return None
289 if isinstance(value, (int, float)):
290 return int(value)
291 if isinstance(value, str):
292 # Try to extract number
293 match = re.search(r"(\d+)", value)
294 if match:
295 return int(match.group(1))
296 return None
298 def _parse_servings(self, yields: str) -> int | None:
299 """Extract serving count from yields string."""
300 if not yields:
301 return None
302 match = re.search(r"(\d+)", yields)
303 if match:
304 return int(match.group(1))
305 return None
307 def _parse_rating(self, value) -> float | None:
308 """Parse rating value to float."""
309 if value is None:
310 return None
311 try:
312 return float(value)
313 except (ValueError, TypeError):
314 return None
316 def _parse_rating_count(self, value) -> int | None:
317 """Parse rating count to int."""
318 if value is None:
319 return None
320 try:
321 return int(value)
322 except (ValueError, TypeError):
323 return None
    async def _download_image(self, image_url: str) -> ContentFile | None:
        """
        Download recipe image and return as ContentFile.

        WebP images are converted to JPEG for iOS 9 compatibility.
        Tries multiple browser profiles if initial request fails.

        Args:
            image_url: Absolute URL of the image; falsy values return None.

        Returns:
            ContentFile of the (possibly converted) image bytes, or None when
            every profile fails or the response never looks like an image.
        """
        if not image_url:
            return None

        # Try each browser profile until one succeeds
        for profile in BROWSER_PROFILES:
            try:
                async with AsyncSession(impersonate=profile) as session:
                    response = await session.get(
                        image_url,
                        timeout=self.timeout,
                        allow_redirects=True,
                    )

                    if response.status_code == 200:
                        content_type = response.headers.get("content-type", "")
                        # Accept if the server says it's an image, or the URL
                        # extension looks like one (some CDNs send bad headers).
                        if "image" in content_type or self._is_image_url(image_url):
                            content = response.content
                            # Convert WebP to JPEG for iOS 9 compatibility
                            content = self._convert_webp_to_jpeg(content)
                            return ContentFile(content)
                    # Non-200 or non-image responses simply fall through and
                    # the next profile is attempted.

            except Exception as e:
                logger.warning(f"Failed to download image {image_url} with {profile}: {e}")
                continue

        return None
359 def _convert_webp_to_jpeg(self, content: bytes) -> bytes:
360 """Convert WebP images to JPEG for iOS 9 compatibility.
362 Also resizes very large images to reduce file size.
363 """
364 try:
365 img = Image.open(BytesIO(content))
367 # Check if conversion is needed (WebP or very large)
368 needs_conversion = img.format == "WEBP"
369 needs_resize = img.width > 1200 or img.height > 1200
371 if not needs_conversion and not needs_resize:
372 return content
374 # Resize if too large (max 1200px on longest side)
375 if needs_resize:
376 img.thumbnail((1200, 1200), Image.Resampling.LANCZOS)
378 # Convert to RGB if needed (for JPEG)
379 if img.mode in ("RGBA", "P"):
380 img = img.convert("RGB")
382 # Save as JPEG
383 output = BytesIO()
384 img.save(output, format="JPEG", quality=85, optimize=True)
385 logger.info(f"Converted image: {img.format} -> JPEG, resized: {needs_resize}")
386 return output.getvalue()
388 except Exception as e:
389 logger.warning(f"Image conversion failed: {e}, using original")
390 return content
392 def _is_image_url(self, url: str) -> bool:
393 """Check if URL looks like an image."""
394 image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".webp")
395 parsed = urlparse(url)
396 return parsed.path.lower().endswith(image_extensions)
398 def _generate_image_filename(self, recipe_url: str, image_url: str) -> str:
399 """Generate a unique filename for the recipe image.
401 Always uses .jpg extension since images are converted to JPEG
402 for iOS 9 compatibility.
403 """
404 # Create hash from URLs for uniqueness
405 url_hash = hashlib.md5(f"{recipe_url}{image_url}".encode()).hexdigest()[:12]
407 return f"recipe_{url_hash}.jpg"