Coverage for apps/recipes/services/scraper.py: 84%

181 statements  

coverage.py v7.13.1, created at 2026-01-11 00:40 +0000

1""" 

2Recipe scraper service using curl_cffi and recipe-scrapers. 

3""" 

4 

5import hashlib 

6import logging 

7import re 

8import threading 

9from io import BytesIO 

10from pathlib import Path 

11from urllib.parse import urlparse 

12 

13from PIL import Image 

14from asgiref.sync import sync_to_async 

15from django.conf import settings 

16from django.core.files.base import ContentFile 

17from django.utils import timezone 

18from curl_cffi.requests import AsyncSession 

19from recipe_scrapers import scrape_html 

20 

21from apps.recipes.services.fingerprint import BROWSER_PROFILES 

22 

23logger = logging.getLogger(__name__) 


class ScraperError(Exception):
    """Base exception for scraper errors."""
    pass


class FetchError(ScraperError):
    """Failed to fetch URL."""
    pass


class ParseError(ScraperError):
    """Failed to parse recipe from HTML."""
    pass


class RecipeScraper:
    """
    Async recipe scraper with browser fingerprint impersonation.

    Uses curl_cffi to bypass anti-bot measures and recipe-scrapers
    to parse structured recipe data from HTML.

    Browser profiles are centralized in fingerprint.py for maintainability.
    """

    DEFAULT_TIMEOUT = 30

    def __init__(self):
        self.timeout = self.DEFAULT_TIMEOUT

    async def scrape_url(self, url: str, profile: 'Profile') -> 'Recipe':
        """
        Scrape a recipe from a URL and save it to the database.

        Args:
            url: The recipe URL to scrape
            profile: The profile that will own this recipe

        Returns:
            Recipe model instance

        Raises:
            FetchError: If the URL cannot be fetched
            ParseError: If the HTML cannot be parsed as a recipe
        """
        # Import here to avoid circular imports
        from apps.recipes.models import Recipe

        # Fetch HTML
        html = await self._fetch_html(url)

        # Parse recipe data
        data = self._parse_recipe(html, url)

        # Check for cached search image first, then download if needed
        image_file = None
        if data.get('image_url'):
            # Try to reuse cached image from search results
            from apps.recipes.models import CachedSearchImage

            try:
                cached = await sync_to_async(
                    CachedSearchImage.objects.get
                )(
                    external_url=data['image_url'],
                    status=CachedSearchImage.STATUS_SUCCESS
                )

                if cached.image:
                    # Reuse cached image file
                    with cached.image.open('rb') as f:
                        image_file = ContentFile(f.read())

                    # Update access time to prevent cleanup
                    cached.last_accessed_at = timezone.now()
                    await sync_to_async(cached.save)(update_fields=['last_accessed_at'])

                    logger.info(f"Reused cached image for {data['image_url']}")

            except CachedSearchImage.DoesNotExist:
                pass

            # If no cache, download as normal
            if not image_file:
                image_file = await self._download_image(data['image_url'])

        # Create recipe record
        recipe = Recipe(
            profile=profile,
            source_url=url,
            canonical_url=data.get('canonical_url', ''),
            host=data['host'],
            site_name=data.get('site_name', ''),
            title=data['title'],
            author=data.get('author', ''),
            description=data.get('description', ''),
            image_url=data.get('image_url', ''),
            ingredients=data.get('ingredients', []),
            ingredient_groups=data.get('ingredient_groups', []),
            instructions=data.get('instructions', []),
            instructions_text=data.get('instructions_text', ''),
            prep_time=data.get('prep_time'),
            cook_time=data.get('cook_time'),
            total_time=data.get('total_time'),
            yields=data.get('yields', ''),
            servings=data.get('servings'),
            category=data.get('category', ''),
            cuisine=data.get('cuisine', ''),
            cooking_method=data.get('cooking_method', ''),
            keywords=data.get('keywords', []),
            dietary_restrictions=data.get('dietary_restrictions', []),
            equipment=data.get('equipment', []),
            nutrition=data.get('nutrition', {}),
            rating=data.get('rating'),
            rating_count=data.get('rating_count'),
            language=data.get('language', ''),
            links=data.get('links', []),
        )

        # Save first to get an ID for the image path
        await sync_to_async(recipe.save)()

        # Attach image if downloaded
        if image_file:
            filename = self._generate_image_filename(url, data.get('image_url', ''))
            await sync_to_async(recipe.image.save)(filename, image_file, save=True)

        # Fire-and-forget: Generate AI tips in background thread (non-blocking)
        thread = threading.Thread(
            target=self._generate_tips_background,
            args=(recipe.id,),
            daemon=True
        )
        thread.start()

        return recipe

    def _generate_tips_background(self, recipe_id: int):
        """Generate AI tips for a recipe in background thread."""
        try:
            import django
            django.setup()  # Ensure Django is configured in thread

            from apps.core.models import AppSettings
            from apps.ai.services.tips import generate_tips

            # Check if AI is available
            settings_obj = AppSettings.get()
            if not settings_obj.openrouter_api_key:
                logger.debug(f'Skipping tips generation for recipe {recipe_id}: No API key')
                return

            # Generate tips
            generate_tips(recipe_id)
            logger.info(f'Auto-generated tips for recipe {recipe_id}')

        except Exception as e:
            # Log but don't fail - tips generation is optional
            logger.warning(f'Failed to auto-generate tips for recipe {recipe_id}: {e}')

    async def _fetch_html(self, url: str) -> str:
        """
        Fetch HTML from URL with browser impersonation.

        Tries multiple browser profiles if initial request fails.
        Browser profiles are configured in fingerprint.py.
        """
        errors = []

        for profile in BROWSER_PROFILES:
            try:
                async with AsyncSession(impersonate=profile) as session:
                    response = await session.get(
                        url,
                        timeout=self.timeout,
                        allow_redirects=True,
                    )

                    if response.status_code == 200:
                        return response.text

                    errors.append(f"{profile}: HTTP {response.status_code}")

            except Exception as e:
                errors.append(f"{profile}: {str(e)}")
                continue

        raise FetchError(f"Failed to fetch {url}: {'; '.join(errors)}")

    def _parse_recipe(self, html: str, url: str) -> dict:
        """
        Parse recipe data from HTML using recipe-scrapers.
        """
        try:
            # supported_only=False allows scraping from any domain using schema.org
            scraper = scrape_html(html, org_url=url, supported_only=False)
        except Exception as e:
            raise ParseError(f"Failed to parse recipe: {str(e)}")

        # Extract host from URL
        parsed_url = urlparse(url)
        host = parsed_url.netloc.replace('www.', '')

        # Build recipe data dict with safe attribute access
        data = {
            'host': host,
            'title': self._safe_get(scraper, 'title', ''),
            'canonical_url': self._safe_get(scraper, 'canonical_url', ''),
            'site_name': self._safe_get(scraper, 'site_name', ''),
            'author': self._safe_get(scraper, 'author', ''),
            'description': self._safe_get(scraper, 'description', ''),
            'image_url': self._safe_get(scraper, 'image', ''),
            'ingredients': self._safe_get(scraper, 'ingredients', []),
            'ingredient_groups': self._safe_get_ingredient_groups(scraper),
            'instructions': self._safe_get(scraper, 'instructions_list', []),
            'instructions_text': self._safe_get(scraper, 'instructions', ''),
            'prep_time': self._parse_time(self._safe_get(scraper, 'prep_time')),
            'cook_time': self._parse_time(self._safe_get(scraper, 'cook_time')),
            'total_time': self._parse_time(self._safe_get(scraper, 'total_time')),
            'yields': self._safe_get(scraper, 'yields', ''),
            'servings': self._parse_servings(self._safe_get(scraper, 'yields', '')),
            'category': self._safe_get(scraper, 'category', ''),
            'cuisine': self._safe_get(scraper, 'cuisine', ''),
            'cooking_method': self._safe_get(scraper, 'cooking_method', ''),
            'keywords': self._safe_get(scraper, 'keywords', []),
            'dietary_restrictions': self._safe_get(scraper, 'dietary_restrictions', []),
            'equipment': self._safe_get(scraper, 'equipment', []),
            'nutrition': self._safe_get(scraper, 'nutrients', {}),
            'rating': self._parse_rating(self._safe_get(scraper, 'ratings')),
            'rating_count': self._parse_rating_count(self._safe_get(scraper, 'ratings_count')),
            'language': self._safe_get(scraper, 'language', ''),
            'links': self._safe_get(scraper, 'links', []),
        }

        if not data['title']:
            raise ParseError("Recipe has no title")

        return data

    def _safe_get(self, scraper, attr: str, default=None):
        """Safely get an attribute from the scraper."""
        try:
            method = getattr(scraper, attr, None)
            if callable(method):
                result = method()
                return result if result is not None else default
            return default
        except Exception:
            return default

    def _safe_get_ingredient_groups(self, scraper) -> list:
        """Get ingredient groups if available."""
        try:
            groups = scraper.ingredient_groups()
            if groups:
                return [
                    {
                        'purpose': getattr(g, 'purpose', ''),
                        'ingredients': getattr(g, 'ingredients', [])
                    }
                    for g in groups
                ]
        except Exception:
            pass
        return []

    def _parse_time(self, value) -> int | None:
        """Parse time value to minutes."""
        if value is None:
            return None
        if isinstance(value, (int, float)):
            return int(value)
        if isinstance(value, str):
            # Try to extract number
            match = re.search(r'(\d+)', value)
            if match:
                return int(match.group(1))
        return None

    def _parse_servings(self, yields: str) -> int | None:
        """Extract serving count from yields string."""
        if not yields:
            return None
        match = re.search(r'(\d+)', yields)
        if match:
            return int(match.group(1))
        return None

    def _parse_rating(self, value) -> float | None:
        """Parse rating value to float."""
        if value is None:
            return None
        try:
            return float(value)
        except (ValueError, TypeError):
            return None

    def _parse_rating_count(self, value) -> int | None:
        """Parse rating count to int."""
        if value is None:
            return None
        try:
            return int(value)
        except (ValueError, TypeError):
            return None

    async def _download_image(self, image_url: str) -> ContentFile | None:
        """
        Download recipe image and return as ContentFile.

        WebP images are converted to JPEG for iOS 9 compatibility.
        Tries multiple browser profiles if initial request fails.
        """
        if not image_url:
            return None

        # Try each browser profile until one succeeds
        for profile in BROWSER_PROFILES:
            try:
                async with AsyncSession(impersonate=profile) as session:
                    response = await session.get(
                        image_url,
                        timeout=self.timeout,
                        allow_redirects=True,
                    )

                    if response.status_code == 200:
                        content_type = response.headers.get('content-type', '')
                        if 'image' in content_type or self._is_image_url(image_url):
                            content = response.content
                            # Convert WebP to JPEG for iOS 9 compatibility
                            content = self._convert_webp_to_jpeg(content)
                            return ContentFile(content)

            except Exception as e:
                logger.warning(f"Failed to download image {image_url} with {profile}: {e}")
                continue

        return None

    def _convert_webp_to_jpeg(self, content: bytes) -> bytes:
        """Convert WebP images to JPEG for iOS 9 compatibility.

        Also resizes very large images to reduce file size.
        """
        try:
            img = Image.open(BytesIO(content))
            src_format = img.format  # Capture before convert(), which resets format to None

            # Check if conversion is needed (WebP or very large)
            needs_conversion = src_format == 'WEBP'
            needs_resize = img.width > 1200 or img.height > 1200

            if not needs_conversion and not needs_resize:
                return content

            # Resize if too large (max 1200px on longest side)
            if needs_resize:
                img.thumbnail((1200, 1200), Image.Resampling.LANCZOS)

            # Convert to RGB if needed (for JPEG)
            if img.mode in ('RGBA', 'P'):
                img = img.convert('RGB')

            # Save as JPEG
            output = BytesIO()
            img.save(output, format='JPEG', quality=85, optimize=True)
            logger.info(f"Converted image: {src_format} -> JPEG, resized: {needs_resize}")
            return output.getvalue()

        except Exception as e:
            logger.warning(f"Image conversion failed: {e}, using original")
            return content

    def _is_image_url(self, url: str) -> bool:
        """Check if URL looks like an image."""
        image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.webp')
        parsed = urlparse(url)
        return parsed.path.lower().endswith(image_extensions)

    def _generate_image_filename(self, recipe_url: str, image_url: str) -> str:
        """Generate a unique filename for the recipe image.

        Always uses .jpg extension since images are converted to JPEG
        for iOS 9 compatibility.
        """
        # Create hash from URLs for uniqueness
        url_hash = hashlib.md5(
            f"{recipe_url}{image_url}".encode()
        ).hexdigest()[:12]

        return f"recipe_{url_hash}.jpg"
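
A minimal usage sketch (not part of the module above), showing how RecipeScraper.scrape_url might be driven from async and synchronous call sites. The Profile import path and the helper names here are hypothetical, and Profile.objects.aget assumes Django 4.1+; treat this as an illustration rather than the project's actual entry point.

import asyncio

from apps.recipes.services.scraper import RecipeScraper, ScraperError
# Hypothetical location of the Profile model; adjust to the real app path.
from apps.profiles.models import Profile


async def import_recipe(url: str, profile_id: int):
    """Fetch, parse, and persist a recipe owned by the given profile."""
    profile = await Profile.objects.aget(pk=profile_id)  # Django 4.1+ async ORM
    scraper = RecipeScraper()
    try:
        return await scraper.scrape_url(url, profile)
    except ScraperError as exc:
        # FetchError and ParseError both derive from ScraperError
        raise ValueError(f"Could not import {url}: {exc}") from exc


def import_recipe_sync(url: str, profile_id: int):
    """Blocking wrapper for synchronous callers such as management commands."""
    return asyncio.run(import_recipe(url, profile_id))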
