Coverage for apps/recipes/services/scraper.py: 84%
181 statements
coverage.py v7.13.1, created at 2026-01-11 00:40 +0000
"""
Recipe scraper service using curl_cffi and recipe-scrapers.
"""

import hashlib
import logging
import re
import threading
from io import BytesIO
from pathlib import Path
from urllib.parse import urlparse

from PIL import Image
from asgiref.sync import sync_to_async
from django.conf import settings
from django.core.files.base import ContentFile
from django.utils import timezone
from curl_cffi.requests import AsyncSession
from recipe_scrapers import scrape_html

from apps.recipes.services.fingerprint import BROWSER_PROFILES
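# BROWSER_PROFILES is assumed to be an ordered list of curl_cffi impersonation
# target names (e.g. "chrome120"); each entry is passed directly to
# AsyncSession(impersonate=...) below.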

logger = logging.getLogger(__name__)


class ScraperError(Exception):
    """Base exception for scraper errors."""
    pass


class FetchError(ScraperError):
    """Failed to fetch URL."""
    pass


class ParseError(ScraperError):
    """Failed to parse recipe from HTML."""
    pass

class RecipeScraper:
    """
    Async recipe scraper with browser fingerprint impersonation.

    Uses curl_cffi to bypass anti-bot measures and recipe-scrapers
    to parse structured recipe data from HTML.

    Browser profiles are centralized in fingerprint.py for maintainability.
    """
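    # Usage sketch (illustrative; assumes an existing Profile instance and an
    # async caller, e.g. an async Django view):
    #     scraper = RecipeScraper()
    #     recipe = await scraper.scrape_url("https://example.com/a-recipe", profile)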
    DEFAULT_TIMEOUT = 30

    def __init__(self):
        self.timeout = self.DEFAULT_TIMEOUT

    async def scrape_url(self, url: str, profile: 'Profile') -> 'Recipe':
        """
        Scrape a recipe from a URL and save it to the database.

        Args:
            url: The recipe URL to scrape
            profile: The profile that will own this recipe

        Returns:
            Recipe model instance

        Raises:
            FetchError: If the URL cannot be fetched
            ParseError: If the HTML cannot be parsed as a recipe
        """
        # Import here to avoid circular imports
        from apps.recipes.models import Recipe

        # Fetch HTML
        html = await self._fetch_html(url)

        # Parse recipe data
        data = self._parse_recipe(html, url)

        # Check for cached search image first, then download if needed
        image_file = None
        if data.get('image_url'):
            # Try to reuse cached image from search results
            from apps.recipes.models import CachedSearchImage

            try:
                cached = await sync_to_async(
                    CachedSearchImage.objects.get
                )(
                    external_url=data['image_url'],
                    status=CachedSearchImage.STATUS_SUCCESS
                )

                if cached.image:
                    # Reuse cached image file
                    with cached.image.open('rb') as f:
                        image_file = ContentFile(f.read())

                    # Update access time to prevent cleanup
                    cached.last_accessed_at = timezone.now()
                    await sync_to_async(cached.save)(update_fields=['last_accessed_at'])

                    logger.info(f"Reused cached image for {data['image_url']}")

            except CachedSearchImage.DoesNotExist:
                pass

            # If no cache, download as normal
            if not image_file:
                image_file = await self._download_image(data['image_url'])

        # Create recipe record
        recipe = Recipe(
            profile=profile,
            source_url=url,
            canonical_url=data.get('canonical_url', ''),
            host=data['host'],
            site_name=data.get('site_name', ''),
            title=data['title'],
            author=data.get('author', ''),
            description=data.get('description', ''),
            image_url=data.get('image_url', ''),
            ingredients=data.get('ingredients', []),
            ingredient_groups=data.get('ingredient_groups', []),
            instructions=data.get('instructions', []),
            instructions_text=data.get('instructions_text', ''),
            prep_time=data.get('prep_time'),
            cook_time=data.get('cook_time'),
            total_time=data.get('total_time'),
            yields=data.get('yields', ''),
            servings=data.get('servings'),
            category=data.get('category', ''),
            cuisine=data.get('cuisine', ''),
            cooking_method=data.get('cooking_method', ''),
            keywords=data.get('keywords', []),
            dietary_restrictions=data.get('dietary_restrictions', []),
            equipment=data.get('equipment', []),
            nutrition=data.get('nutrition', {}),
            rating=data.get('rating'),
            rating_count=data.get('rating_count'),
            language=data.get('language', ''),
            links=data.get('links', []),
        )

        # Save first to get an ID for the image path
        await sync_to_async(recipe.save)()

        # Attach image if downloaded
        if image_file:
            filename = self._generate_image_filename(url, data.get('image_url', ''))
            await sync_to_async(recipe.image.save)(filename, image_file, save=True)

        # Fire-and-forget: Generate AI tips in background thread (non-blocking)
        thread = threading.Thread(
            target=self._generate_tips_background,
            args=(recipe.id,),
            daemon=True
        )
        thread.start()

        return recipe

    def _generate_tips_background(self, recipe_id: int):
        """Generate AI tips for a recipe in background thread."""
        try:
            import django
            django.setup()  # Ensure Django is configured in thread

            from apps.core.models import AppSettings
            from apps.ai.services.tips import generate_tips

            # Check if AI is available
            settings_obj = AppSettings.get()
            if not settings_obj.openrouter_api_key:
                logger.debug(f'Skipping tips generation for recipe {recipe_id}: No API key')
                return

            # Generate tips
            generate_tips(recipe_id)
            logger.info(f'Auto-generated tips for recipe {recipe_id}')

        except Exception as e:
            # Log but don't fail - tips generation is optional
            logger.warning(f'Failed to auto-generate tips for recipe {recipe_id}: {e}')

    async def _fetch_html(self, url: str) -> str:
        """
        Fetch HTML from URL with browser impersonation.

        Tries multiple browser profiles if the initial request fails.
        Browser profiles are configured in fingerprint.py.
        """
        errors = []

        for profile in BROWSER_PROFILES:
            try:
                async with AsyncSession(impersonate=profile) as session:
                    response = await session.get(
                        url,
                        timeout=self.timeout,
                        allow_redirects=True,
                    )

                    if response.status_code == 200:
                        return response.text

                    errors.append(f"{profile}: HTTP {response.status_code}")

            except Exception as e:
                errors.append(f"{profile}: {str(e)}")
                continue

        raise FetchError(f"Failed to fetch {url}: {'; '.join(errors)}")

    def _parse_recipe(self, html: str, url: str) -> dict:
        """
        Parse recipe data from HTML using recipe-scrapers.
        """
        try:
            # supported_only=False allows scraping from any domain using schema.org
            scraper = scrape_html(html, org_url=url, supported_only=False)
        except Exception as e:
            raise ParseError(f"Failed to parse recipe: {str(e)}") from e

        # Extract host from URL, stripping only a leading "www."
        parsed_url = urlparse(url)
        host = parsed_url.netloc.removeprefix('www.')

        # Build recipe data dict with safe attribute access
        data = {
            'host': host,
            'title': self._safe_get(scraper, 'title', ''),
            'canonical_url': self._safe_get(scraper, 'canonical_url', ''),
            'site_name': self._safe_get(scraper, 'site_name', ''),
            'author': self._safe_get(scraper, 'author', ''),
            'description': self._safe_get(scraper, 'description', ''),
            'image_url': self._safe_get(scraper, 'image', ''),
            'ingredients': self._safe_get(scraper, 'ingredients', []),
            'ingredient_groups': self._safe_get_ingredient_groups(scraper),
            'instructions': self._safe_get(scraper, 'instructions_list', []),
            'instructions_text': self._safe_get(scraper, 'instructions', ''),
            'prep_time': self._parse_time(self._safe_get(scraper, 'prep_time')),
            'cook_time': self._parse_time(self._safe_get(scraper, 'cook_time')),
            'total_time': self._parse_time(self._safe_get(scraper, 'total_time')),
            'yields': self._safe_get(scraper, 'yields', ''),
            'servings': self._parse_servings(self._safe_get(scraper, 'yields', '')),
            'category': self._safe_get(scraper, 'category', ''),
            'cuisine': self._safe_get(scraper, 'cuisine', ''),
            'cooking_method': self._safe_get(scraper, 'cooking_method', ''),
            'keywords': self._safe_get(scraper, 'keywords', []),
            'dietary_restrictions': self._safe_get(scraper, 'dietary_restrictions', []),
            'equipment': self._safe_get(scraper, 'equipment', []),
            'nutrition': self._safe_get(scraper, 'nutrients', {}),
            'rating': self._parse_rating(self._safe_get(scraper, 'ratings')),
            'rating_count': self._parse_rating_count(self._safe_get(scraper, 'ratings_count')),
            'language': self._safe_get(scraper, 'language', ''),
            'links': self._safe_get(scraper, 'links', []),
        }

        if not data['title']:
            raise ParseError("Recipe has no title")

        return data

    def _safe_get(self, scraper, attr: str, default=None):
        """Safely get an attribute from the scraper."""
        try:
            method = getattr(scraper, attr, None)
            if callable(method):
                result = method()
                return result if result is not None else default
            return default
        except Exception:
            return default

    def _safe_get_ingredient_groups(self, scraper) -> list:
        """Get ingredient groups if available."""
        try:
            groups = scraper.ingredient_groups()
            if groups:
                return [
                    {
                        'purpose': getattr(g, 'purpose', ''),
                        'ingredients': getattr(g, 'ingredients', [])
                    }
                    for g in groups
                ]
        except Exception:
            pass
        return []

    def _parse_time(self, value) -> int | None:
        """Parse time value to minutes."""
        if value is None:
            return None
        if isinstance(value, (int, float)):
            return int(value)
        if isinstance(value, str):
            # Try to extract number
            match = re.search(r'(\d+)', value)
            if match:
                return int(match.group(1))
        return None

    def _parse_servings(self, yields: str) -> int | None:
        """Extract serving count from yields string."""
        if not yields:
            return None
        match = re.search(r'(\d+)', yields)
        if match:
            return int(match.group(1))
        return None
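    # Example behavior of the two parsers above (follows from the regexes):
    #     _parse_time("45 minutes") -> 45, _parse_time(30) -> 30
    #     _parse_servings("4 servings") -> 4
    # Only the first integer in a string is used, so "1 hour 30 minutes" yields 1.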

    def _parse_rating(self, value) -> float | None:
        """Parse rating value to float."""
        if value is None:
            return None
        try:
            return float(value)
        except (ValueError, TypeError):
            return None

    def _parse_rating_count(self, value) -> int | None:
        """Parse rating count to int."""
        if value is None:
            return None
        try:
            return int(value)
        except (ValueError, TypeError):
            return None

    async def _download_image(self, image_url: str) -> ContentFile | None:
        """
        Download recipe image and return as ContentFile.

        WebP images are converted to JPEG for iOS 9 compatibility.
        Tries multiple browser profiles if the initial request fails.
        """
        if not image_url:
            return None

        # Try each browser profile until one succeeds
        for profile in BROWSER_PROFILES:
            try:
                async with AsyncSession(impersonate=profile) as session:
                    response = await session.get(
                        image_url,
                        timeout=self.timeout,
                        allow_redirects=True,
                    )

                    if response.status_code == 200:
                        content_type = response.headers.get('content-type', '')
                        if 'image' in content_type or self._is_image_url(image_url):
                            content = response.content
                            # Convert WebP to JPEG for iOS 9 compatibility
                            content = self._convert_webp_to_jpeg(content)
                            return ContentFile(content)

            except Exception as e:
                logger.warning(f"Failed to download image {image_url} with {profile}: {e}")
                continue

        return None

    def _convert_webp_to_jpeg(self, content: bytes) -> bytes:
        """Convert WebP images to JPEG for iOS 9 compatibility.

        Also resizes very large images to reduce file size.
        """
        try:
            img = Image.open(BytesIO(content))
            original_format = img.format

            # Check if conversion is needed (WebP or very large)
            needs_conversion = img.format == 'WEBP'
            needs_resize = img.width > 1200 or img.height > 1200

            if not needs_conversion and not needs_resize:
                return content

            # Resize if too large (max 1200px on longest side)
            if needs_resize:
                img.thumbnail((1200, 1200), Image.Resampling.LANCZOS)

            # Convert to RGB if needed (for JPEG)
            if img.mode in ('RGBA', 'P'):
                img = img.convert('RGB')

            # Save as JPEG
            output = BytesIO()
            img.save(output, format='JPEG', quality=85, optimize=True)
            # Log the format captured at open time; Image.convert() returns a
            # new image whose .format is None.
            logger.info(f"Converted image: {original_format} -> JPEG, resized: {needs_resize}")
            return output.getvalue()

        except Exception as e:
            logger.warning(f"Image conversion failed: {e}, using original")
            return content
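    # Note on the helper above: only WebP triggers a format conversion, but any
    # image over 1200px is re-encoded as JPEG after resizing (animated GIFs lose
    # their animation in that case); smaller non-WebP images pass through untouched.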

    def _is_image_url(self, url: str) -> bool:
        """Check if URL looks like an image."""
        image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.webp')
        parsed = urlparse(url)
        return parsed.path.lower().endswith(image_extensions)

    def _generate_image_filename(self, recipe_url: str, image_url: str) -> str:
        """Generate a unique filename for the recipe image.

        Always uses .jpg extension since images are converted to JPEG
        for iOS 9 compatibility.
        """
        # Create hash from URLs for uniqueness
        url_hash = hashlib.md5(
            f"{recipe_url}{image_url}".encode()
        ).hexdigest()[:12]

        return f"recipe_{url_hash}.jpg"
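
# Illustrative driver (sketch, not part of the service): with Django configured
# and a Profile row available, the scraper can be exercised from synchronous code
# roughly as follows. The Profile import path is an assumption for the example.
#
#     import asyncio
#     from apps.profiles.models import Profile  # assumed location of Profile
#
#     profile = Profile.objects.first()
#     recipe = asyncio.run(RecipeScraper().scrape_url(
#         "https://www.example.com/best-banana-bread", profile
#     ))
#     print(recipe.title)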