Coverage for apps/recipes/services/scraper.py: 84%

181 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-14 19:13 +0000

1""" 

2Recipe scraper service using curl_cffi and recipe-scrapers. 

3""" 

4 

5import hashlib 

6import logging 

7import re 

8import threading 

9from io import BytesIO 

10from pathlib import Path 

11from urllib.parse import urlparse 

12 

13from PIL import Image 

14from asgiref.sync import sync_to_async 

15from django.conf import settings 

16from django.core.files.base import ContentFile 

17from django.utils import timezone 

18from curl_cffi.requests import AsyncSession 

19from recipe_scrapers import scrape_html 

20 

21from apps.recipes.services.fingerprint import BROWSER_PROFILES 

22 

23logger = logging.getLogger(__name__) 

24 

25 

class ScraperError(Exception):
    """Base class for all recipe-scraper failures."""

30 

31 

class FetchError(ScraperError):
    """Raised when a URL cannot be fetched with any browser profile."""

36 

37 

class ParseError(ScraperError):
    """Raised when HTML cannot be parsed into recipe data."""

42 

43 

class RecipeScraper:
    """
    Async recipe scraper with browser fingerprint impersonation.

    Uses curl_cffi to bypass anti-bot measures and recipe-scrapers
    to parse structured recipe data from HTML.

    Browser profiles are centralized in fingerprint.py for maintainability.
    """

    # Per-request timeout (seconds) used for both HTML and image fetches.
    DEFAULT_TIMEOUT = 30

    def __init__(self):
        # Instance-level copy so callers can tune the timeout per scraper
        # without affecting the class-wide default.
        self.timeout = self.DEFAULT_TIMEOUT

58 

    async def scrape_url(self, url: str, profile: "Profile") -> "Recipe":
        """
        Scrape a recipe from a URL and save it to the database.

        Fetches the page with browser impersonation, parses structured
        recipe data, attaches an image (reusing a cached search image
        when one exists), and fires off AI tip generation in a
        background thread.

        Args:
            url: The recipe URL to scrape
            profile: The profile that will own this recipe

        Returns:
            Recipe model instance (already saved; image attached when available)

        Raises:
            FetchError: If the URL cannot be fetched
            ParseError: If the HTML cannot be parsed as a recipe
        """
        # Import here to avoid circular imports
        from apps.recipes.models import Recipe

        # Fetch HTML (raises FetchError when every browser profile fails)
        html = await self._fetch_html(url)

        # Parse recipe data (raises ParseError when no title is found)
        data = self._parse_recipe(html, url)

        # Check for cached search image first, then download if needed
        image_file = None
        if data.get("image_url"):
            # Try to reuse cached image from search results
            from apps.recipes.models import CachedSearchImage

            try:
                cached = await sync_to_async(CachedSearchImage.objects.get)(
                    external_url=data["image_url"], status=CachedSearchImage.STATUS_SUCCESS
                )

                if cached.image:
                    # Reuse cached image file
                    with cached.image.open("rb") as f:
                        image_file = ContentFile(f.read())

                    # Update access time to prevent cleanup
                    cached.last_accessed_at = timezone.now()
                    await sync_to_async(cached.save)(update_fields=["last_accessed_at"])

                    logger.info(f"Reused cached image for {data['image_url']}")

            except CachedSearchImage.DoesNotExist:
                pass

            # If no cache, download as normal
            if not image_file:
                image_file = await self._download_image(data["image_url"])

        # Create recipe record
        recipe = Recipe(
            profile=profile,
            source_url=url,
            canonical_url=data.get("canonical_url", ""),
            host=data["host"],
            site_name=data.get("site_name", ""),
            title=data["title"],
            author=data.get("author", ""),
            description=data.get("description", ""),
            image_url=data.get("image_url", ""),
            ingredients=data.get("ingredients", []),
            ingredient_groups=data.get("ingredient_groups", []),
            instructions=data.get("instructions", []),
            instructions_text=data.get("instructions_text", ""),
            prep_time=data.get("prep_time"),
            cook_time=data.get("cook_time"),
            total_time=data.get("total_time"),
            yields=data.get("yields", ""),
            servings=data.get("servings"),
            category=data.get("category", ""),
            cuisine=data.get("cuisine", ""),
            cooking_method=data.get("cooking_method", ""),
            keywords=data.get("keywords", []),
            dietary_restrictions=data.get("dietary_restrictions", []),
            equipment=data.get("equipment", []),
            nutrition=data.get("nutrition", {}),
            rating=data.get("rating"),
            rating_count=data.get("rating_count"),
            language=data.get("language", ""),
            links=data.get("links", []),
        )

        # Save first to get an ID for the image path
        await sync_to_async(recipe.save)()

        # Attach image if downloaded
        if image_file:
            filename = self._generate_image_filename(url, data.get("image_url", ""))
            await sync_to_async(recipe.image.save)(filename, image_file, save=True)

        # Fire-and-forget: Generate AI tips in background thread (non-blocking)
        # NOTE(review): a daemon thread can be killed mid-run at process exit;
        # tips generation is best-effort so this appears intentional — confirm.
        thread = threading.Thread(target=self._generate_tips_background, args=(recipe.id,), daemon=True)
        thread.start()

        return recipe

158 

159 def _generate_tips_background(self, recipe_id: int): 

160 """Generate AI tips for a recipe in background thread.""" 

161 try: 

162 import django 

163 

164 django.setup() # Ensure Django is configured in thread 

165 

166 from apps.core.models import AppSettings 

167 from apps.ai.services.tips import generate_tips 

168 

169 # Check if AI is available 

170 settings_obj = AppSettings.get() 

171 if not settings_obj.openrouter_api_key: 

172 logger.debug(f"Skipping tips generation for recipe {recipe_id}: No API key") 

173 return 

174 

175 # Generate tips 

176 generate_tips(recipe_id) 

177 logger.info(f"Auto-generated tips for recipe {recipe_id}") 

178 

179 except Exception as e: 

180 # Log but don't fail - tips generation is optional 

181 logger.warning(f"Failed to auto-generate tips for recipe {recipe_id}: {e}") 

182 

183 async def _fetch_html(self, url: str) -> str: 

184 """ 

185 Fetch HTML from URL with browser impersonation. 

186 

187 Tries multiple browser profiles if initial request fails. 

188 Browser profiles are configured in fingerprint.py. 

189 """ 

190 errors = [] 

191 

192 for profile in BROWSER_PROFILES: 

193 try: 

194 async with AsyncSession(impersonate=profile) as session: 

195 response = await session.get( 

196 url, 

197 timeout=self.timeout, 

198 allow_redirects=True, 

199 ) 

200 

201 if response.status_code == 200: 

202 return response.text 

203 

204 errors.append(f"{profile}: HTTP {response.status_code}") 

205 

206 except Exception as e: 

207 errors.append(f"{profile}: {str(e)}") 

208 continue 

209 

210 raise FetchError(f"Failed to fetch {url}: {'; '.join(errors)}") 

211 

212 def _parse_recipe(self, html: str, url: str) -> dict: 

213 """ 

214 Parse recipe data from HTML using recipe-scrapers. 

215 """ 

216 try: 

217 # supported_only=False allows scraping from any domain using schema.org 

218 scraper = scrape_html(html, org_url=url, supported_only=False) 

219 except Exception as e: 

220 raise ParseError(f"Failed to parse recipe: {str(e)}") 

221 

222 # Extract host from URL 

223 parsed_url = urlparse(url) 

224 host = parsed_url.netloc.replace("www.", "") 

225 

226 # Build recipe data dict with safe attribute access 

227 data = { 

228 "host": host, 

229 "title": self._safe_get(scraper, "title", ""), 

230 "canonical_url": self._safe_get(scraper, "canonical_url", ""), 

231 "site_name": self._safe_get(scraper, "site_name", ""), 

232 "author": self._safe_get(scraper, "author", ""), 

233 "description": self._safe_get(scraper, "description", ""), 

234 "image_url": self._safe_get(scraper, "image", ""), 

235 "ingredients": self._safe_get(scraper, "ingredients", []), 

236 "ingredient_groups": self._safe_get_ingredient_groups(scraper), 

237 "instructions": self._safe_get(scraper, "instructions_list", []), 

238 "instructions_text": self._safe_get(scraper, "instructions", ""), 

239 "prep_time": self._parse_time(self._safe_get(scraper, "prep_time")), 

240 "cook_time": self._parse_time(self._safe_get(scraper, "cook_time")), 

241 "total_time": self._parse_time(self._safe_get(scraper, "total_time")), 

242 "yields": self._safe_get(scraper, "yields", ""), 

243 "servings": self._parse_servings(self._safe_get(scraper, "yields", "")), 

244 "category": self._safe_get(scraper, "category", ""), 

245 "cuisine": self._safe_get(scraper, "cuisine", ""), 

246 "cooking_method": self._safe_get(scraper, "cooking_method", ""), 

247 "keywords": self._safe_get(scraper, "keywords", []), 

248 "dietary_restrictions": self._safe_get(scraper, "dietary_restrictions", []), 

249 "equipment": self._safe_get(scraper, "equipment", []), 

250 "nutrition": self._safe_get(scraper, "nutrients", {}), 

251 "rating": self._parse_rating(self._safe_get(scraper, "ratings")), 

252 "rating_count": self._parse_rating_count(self._safe_get(scraper, "ratings_count")), 

253 "language": self._safe_get(scraper, "language", ""), 

254 "links": self._safe_get(scraper, "links", []), 

255 } 

256 

257 if not data["title"]: 

258 raise ParseError("Recipe has no title") 

259 

260 return data 

261 

262 def _safe_get(self, scraper, attr: str, default=None): 

263 """Safely get an attribute from the scraper.""" 

264 try: 

265 method = getattr(scraper, attr, None) 

266 if callable(method): 

267 result = method() 

268 return result if result is not None else default 

269 return default 

270 except Exception: 

271 return default 

272 

273 def _safe_get_ingredient_groups(self, scraper) -> list: 

274 """Get ingredient groups if available.""" 

275 try: 

276 groups = scraper.ingredient_groups() 

277 if groups: 

278 return [ 

279 {"purpose": getattr(g, "purpose", ""), "ingredients": getattr(g, "ingredients", [])} for g in groups 

280 ] 

281 except Exception: 

282 pass 

283 return [] 

284 

285 def _parse_time(self, value) -> int | None: 

286 """Parse time value to minutes.""" 

287 if value is None: 

288 return None 

289 if isinstance(value, (int, float)): 

290 return int(value) 

291 if isinstance(value, str): 

292 # Try to extract number 

293 match = re.search(r"(\d+)", value) 

294 if match: 

295 return int(match.group(1)) 

296 return None 

297 

298 def _parse_servings(self, yields: str) -> int | None: 

299 """Extract serving count from yields string.""" 

300 if not yields: 

301 return None 

302 match = re.search(r"(\d+)", yields) 

303 if match: 

304 return int(match.group(1)) 

305 return None 

306 

307 def _parse_rating(self, value) -> float | None: 

308 """Parse rating value to float.""" 

309 if value is None: 

310 return None 

311 try: 

312 return float(value) 

313 except (ValueError, TypeError): 

314 return None 

315 

316 def _parse_rating_count(self, value) -> int | None: 

317 """Parse rating count to int.""" 

318 if value is None: 

319 return None 

320 try: 

321 return int(value) 

322 except (ValueError, TypeError): 

323 return None 

324 

325 async def _download_image(self, image_url: str) -> ContentFile | None: 

326 """ 

327 Download recipe image and return as ContentFile. 

328 

329 WebP images are converted to JPEG for iOS 9 compatibility. 

330 Tries multiple browser profiles if initial request fails. 

331 """ 

332 if not image_url: 

333 return None 

334 

335 # Try each browser profile until one succeeds 

336 for profile in BROWSER_PROFILES: 

337 try: 

338 async with AsyncSession(impersonate=profile) as session: 

339 response = await session.get( 

340 image_url, 

341 timeout=self.timeout, 

342 allow_redirects=True, 

343 ) 

344 

345 if response.status_code == 200: 

346 content_type = response.headers.get("content-type", "") 

347 if "image" in content_type or self._is_image_url(image_url): 

348 content = response.content 

349 # Convert WebP to JPEG for iOS 9 compatibility 

350 content = self._convert_webp_to_jpeg(content) 

351 return ContentFile(content) 

352 

353 except Exception as e: 

354 logger.warning(f"Failed to download image {image_url} with {profile}: {e}") 

355 continue 

356 

357 return None 

358 

359 def _convert_webp_to_jpeg(self, content: bytes) -> bytes: 

360 """Convert WebP images to JPEG for iOS 9 compatibility. 

361 

362 Also resizes very large images to reduce file size. 

363 """ 

364 try: 

365 img = Image.open(BytesIO(content)) 

366 

367 # Check if conversion is needed (WebP or very large) 

368 needs_conversion = img.format == "WEBP" 

369 needs_resize = img.width > 1200 or img.height > 1200 

370 

371 if not needs_conversion and not needs_resize: 

372 return content 

373 

374 # Resize if too large (max 1200px on longest side) 

375 if needs_resize: 

376 img.thumbnail((1200, 1200), Image.Resampling.LANCZOS) 

377 

378 # Convert to RGB if needed (for JPEG) 

379 if img.mode in ("RGBA", "P"): 

380 img = img.convert("RGB") 

381 

382 # Save as JPEG 

383 output = BytesIO() 

384 img.save(output, format="JPEG", quality=85, optimize=True) 

385 logger.info(f"Converted image: {img.format} -> JPEG, resized: {needs_resize}") 

386 return output.getvalue() 

387 

388 except Exception as e: 

389 logger.warning(f"Image conversion failed: {e}, using original") 

390 return content 

391 

392 def _is_image_url(self, url: str) -> bool: 

393 """Check if URL looks like an image.""" 

394 image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".webp") 

395 parsed = urlparse(url) 

396 return parsed.path.lower().endswith(image_extensions) 

397 

398 def _generate_image_filename(self, recipe_url: str, image_url: str) -> str: 

399 """Generate a unique filename for the recipe image. 

400 

401 Always uses .jpg extension since images are converted to JPEG 

402 for iOS 9 compatibility. 

403 """ 

404 # Create hash from URLs for uniqueness 

405 url_hash = hashlib.md5(f"{recipe_url}{image_url}".encode()).hexdigest()[:12] 

406 

407 return f"recipe_{url_hash}.jpg" 

← Back to Dashboard