Coverage for apps / recipes / services / scraper.py: 76%
253 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-12 10:49 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-12 10:49 +0000
1"""
2Recipe scraper service using curl_cffi and recipe-scrapers.
3"""
5import hashlib
6import logging
7import re
8import threading
9from io import BytesIO
10from urllib.parse import urlparse
12from apps.recipes.services.sanitizer import sanitize_recipe_data
14from PIL import Image
15from asgiref.sync import sync_to_async
16from django.core.files.base import ContentFile
17from django.utils import timezone
18from curl_cffi.requests import AsyncSession
19from recipe_scrapers import scrape_html
21from apps.core.validators import (
22 MAX_HTML_SIZE,
23 MAX_IMAGE_SIZE,
24 MAX_REDIRECT_HOPS,
25 check_content_size,
26 check_response_size,
27 validate_url,
28 validate_redirect_url,
29)
30from apps.recipes.services.fingerprint import BROWSER_PROFILES
# Limit decompression bomb attacks via PIL: cap how many pixels PIL will
# decode for a single image (it warns/refuses beyond this threshold).
Image.MAX_IMAGE_PIXELS = 178_956_970  # ~180 megapixels

# Module-level logger, named after this module per project convention.
logger = logging.getLogger(__name__)
class ScraperError(Exception):
    """Common base class for every failure raised by the recipe scraper."""
class FetchError(ScraperError):
    """Raised when the target URL cannot be fetched (network, SSRF, size)."""
class ParseError(ScraperError):
    """Raised when fetched HTML does not contain a parseable recipe."""
class RecipeScraper:
    """
    Async recipe scraper with browser fingerprint impersonation.

    Pages are fetched with curl_cffi (impersonating real browsers to get
    past anti-bot measures) and parsed into structured recipe data with
    recipe-scrapers.

    Browser profiles are centralized in fingerprint.py for maintainability.
    """

    # Per-request timeout in seconds.
    DEFAULT_TIMEOUT = 30

    def __init__(self):
        # Copied onto the instance so individual scrapers could be tuned.
        self.timeout = self.DEFAULT_TIMEOUT
    async def scrape_url(self, url: str, profile: "Profile") -> "Recipe":
        """
        Scrape a recipe from a URL and save it to the database.

        Pipeline: SSRF-validate the URL (pinning DNS), fetch the HTML,
        parse structured recipe data, attach an image (reusing the cached
        search image when one exists), save the Recipe, then fire off
        background AI-tip generation.

        Args:
            url: The recipe URL to scrape
            profile: The profile that will own this recipe

        Returns:
            Recipe model instance (already saved; image attached if found)

        Raises:
            FetchError: If the URL cannot be fetched or fails SSRF validation
            ParseError: If the HTML cannot be parsed as a recipe
        """
        # Import here to avoid circular imports
        from apps.recipes.models import Recipe

        # Validate URL for SSRF protection (returns pinned DNS resolution)
        try:
            resolved = validate_url(url)
        except ValueError as e:
            raise FetchError(str(e))

        # Fetch HTML using pinned DNS to prevent TOCTOU rebinding
        html = await self._fetch_html(url, resolved.curl_resolve)

        # Parse recipe data
        data = self._parse_recipe(html, url)

        # Check for cached search image first, then download if needed
        image_file = None
        if data.get("image_url"):
            # Try to reuse cached image from search results
            from apps.recipes.models import CachedSearchImage

            try:
                cached = await sync_to_async(CachedSearchImage.objects.get)(
                    external_url=data["image_url"], status=CachedSearchImage.STATUS_SUCCESS
                )

                if cached.image:
                    # Reuse cached image file (avoids a second download)
                    with cached.image.open("rb") as f:
                        image_file = ContentFile(f.read())

                    # Update access time to prevent cleanup
                    cached.last_accessed_at = timezone.now()
                    await sync_to_async(cached.save)(update_fields=["last_accessed_at"])

                    logger.info(f"Reused cached image for {data['image_url']}")

            except CachedSearchImage.DoesNotExist:
                pass

            # If no cache, download as normal
            if not image_file:
                image_file = await self._download_image(data["image_url"])

        # Create recipe record (in memory only at this point)
        recipe = Recipe(
            profile=profile,
            source_url=url,
            canonical_url=data.get("canonical_url", ""),
            host=data["host"],
            site_name=data.get("site_name", ""),
            title=data["title"],
            author=data.get("author", ""),
            description=data.get("description", ""),
            image_url=data.get("image_url", ""),
            ingredients=data.get("ingredients", []),
            ingredient_groups=data.get("ingredient_groups", []),
            instructions=data.get("instructions", []),
            instructions_text=data.get("instructions_text", ""),
            prep_time=data.get("prep_time"),
            cook_time=data.get("cook_time"),
            total_time=data.get("total_time"),
            yields=data.get("yields", ""),
            servings=data.get("servings"),
            category=data.get("category", ""),
            cuisine=data.get("cuisine", ""),
            cooking_method=data.get("cooking_method", ""),
            keywords=data.get("keywords", []),
            dietary_restrictions=data.get("dietary_restrictions", []),
            equipment=data.get("equipment", []),
            nutrition=data.get("nutrition", {}),
            rating=data.get("rating"),
            rating_count=data.get("rating_count"),
            language=data.get("language", ""),
            links=data.get("links", []),
        )

        # Save first to get an ID for the image path
        await sync_to_async(recipe.save)()

        # Attach image if downloaded
        if image_file:
            filename = self._generate_image_filename(url, data.get("image_url", ""))
            await sync_to_async(recipe.image.save)(filename, image_file, save=True)

        # Fire-and-forget: Generate AI tips in background thread (non-blocking)
        thread = threading.Thread(target=self._generate_tips_background, args=(recipe.id,), daemon=True)
        thread.start()

        return recipe
177 def _generate_tips_background(self, recipe_id: int):
178 """Generate AI tips for a recipe in background thread."""
179 try:
180 import django
182 django.setup() # Ensure Django is configured in thread
184 from apps.core.models import AppSettings
185 from apps.ai.services.tips import generate_tips
187 # Check if AI is available
188 settings_obj = AppSettings.get()
189 if not settings_obj.openrouter_api_key:
190 logger.debug(f"Skipping tips generation for recipe {recipe_id}: No API key")
191 return
193 # Generate tips
194 generate_tips(recipe_id)
195 logger.info(f"Auto-generated tips for recipe {recipe_id}")
197 except Exception as e:
198 # Log but don't fail - tips generation is optional
199 logger.warning(f"Failed to auto-generate tips for recipe {recipe_id}: {e}")
201 async def _fetch_html(self, url: str, curl_resolve: list[str] | None = None) -> str:
202 """
203 Fetch HTML from URL with browser impersonation.
205 Follows redirects manually with per-hop SSRF validation (max 5 hops).
206 Enforces response size limit (10MB).
207 Tries multiple browser profiles if initial request fails.
209 Args:
210 url: URL to fetch
211 curl_resolve: DNS pinning list from validate_url to prevent TOCTOU rebinding
212 """
213 errors = []
215 for profile in BROWSER_PROFILES:
216 try:
217 html = await self._fetch_with_redirects(url, profile, MAX_HTML_SIZE, curl_resolve)
218 if html is not None:
219 return html
220 errors.append(f"{profile}: empty response")
221 except FetchError:
222 raise
223 except ValueError as e:
224 raise FetchError(str(e))
225 except Exception as e:
226 errors.append(f"{profile}: {str(e)}")
227 continue
229 raise FetchError(f"Failed to fetch {url}: {'; '.join(errors)}")
    async def _fetch_with_redirects(self, url, profile, max_size, curl_resolve=None):
        """Fetch URL following redirects with per-hop SSRF validation and DNS pinning.

        Args:
            url: Starting URL.
            profile: curl_cffi browser-impersonation profile.
            max_size: Maximum allowed response size in bytes.
            curl_resolve: Pinned DNS entries from validate_url for the first hop.

        Returns:
            The page text on HTTP 200, or None for any other non-redirect
            status (the caller then tries the next browser profile).

        Raises:
            FetchError: Redirect without a Location header, oversized
                response, or too many redirect hops.
            ValueError: Propagated from validate_redirect_url when a redirect
                target fails SSRF validation (caller converts to FetchError).
        """
        from curl_cffi import CurlOpt

        current_url = url
        current_resolve = curl_resolve or []
        for _ in range(MAX_REDIRECT_HOPS):
            # Pin DNS for this hop so the host cannot be re-resolved to a
            # different (internal) address between validation and fetch.
            curl_opts = {CurlOpt.RESOLVE: current_resolve} if current_resolve else {}
            async with AsyncSession(impersonate=profile, curl_options=curl_opts) as session:
                response = await session.get(
                    current_url,
                    timeout=self.timeout,
                    allow_redirects=False,
                )

                if response.status_code in (301, 302, 303, 307, 308):
                    location = response.headers.get("location")
                    if not location:
                        raise FetchError("Redirect without Location header")
                    # Every redirect target is re-validated against the SSRF
                    # blocklist and gets its own pinned resolution.
                    resolved = validate_redirect_url(location)
                    current_url = location
                    current_resolve = resolved.curl_resolve
                    continue

                if response.status_code == 200:
                    # Reject by declared Content-Length first (cheap)...
                    if not check_response_size(response, max_size):
                        raise FetchError(f"Response too large (Content-Length > {max_size})")
                    content = response.text
                    # ...then by actual encoded size, since headers can lie.
                    check_content_size(content.encode("utf-8", errors="replace"), max_size)
                    return content

                # Any other status: give up on this profile.
                return None

        raise FetchError(f"Too many redirects (>{MAX_REDIRECT_HOPS})")
266 def _parse_recipe(self, html: str, url: str) -> dict:
267 """
268 Parse recipe data from HTML using recipe-scrapers.
269 """
270 try:
271 # supported_only=False allows scraping from any domain using schema.org
272 scraper = scrape_html(html, org_url=url, supported_only=False)
273 except Exception as e:
274 raise ParseError(f"Failed to parse recipe: {str(e)}")
276 # Extract host from URL
277 parsed_url = urlparse(url)
278 host = parsed_url.netloc.replace("www.", "")
280 # Build recipe data dict with safe attribute access
281 data = {
282 "host": host,
283 "title": self._safe_get(scraper, "title", ""),
284 "canonical_url": self._safe_get(scraper, "canonical_url", ""),
285 "site_name": self._safe_get(scraper, "site_name", ""),
286 "author": self._safe_get(scraper, "author", ""),
287 "description": self._safe_get(scraper, "description", ""),
288 "image_url": self._safe_get(scraper, "image", ""),
289 "ingredients": self._safe_get(scraper, "ingredients", []),
290 "ingredient_groups": self._safe_get_ingredient_groups(scraper),
291 "instructions": self._safe_get(scraper, "instructions_list", []),
292 "instructions_text": self._safe_get(scraper, "instructions", ""),
293 "prep_time": self._parse_time(self._safe_get(scraper, "prep_time")),
294 "cook_time": self._parse_time(self._safe_get(scraper, "cook_time")),
295 "total_time": self._parse_time(self._safe_get(scraper, "total_time")),
296 "yields": self._safe_get(scraper, "yields", ""),
297 "servings": self._parse_servings(self._safe_get(scraper, "yields", "")),
298 "category": self._safe_get(scraper, "category", ""),
299 "cuisine": self._safe_get(scraper, "cuisine", ""),
300 "cooking_method": self._safe_get(scraper, "cooking_method", ""),
301 "keywords": self._safe_get(scraper, "keywords", []),
302 "dietary_restrictions": self._safe_get(scraper, "dietary_restrictions", []),
303 "equipment": self._safe_get(scraper, "equipment", []),
304 "nutrition": self._safe_get(scraper, "nutrients", {}),
305 "rating": self._parse_rating(self._safe_get(scraper, "ratings")),
306 "rating_count": self._parse_rating_count(self._safe_get(scraper, "ratings_count")),
307 "language": self._safe_get(scraper, "language", ""),
308 "links": self._safe_get(scraper, "links", []),
309 }
311 if not data["title"]:
312 raise ParseError("Recipe has no title")
314 # Sanitize all text fields to strip HTML (defense-in-depth against stored XSS)
315 sanitize_recipe_data(data)
317 return data
319 def _safe_get(self, scraper, attr: str, default=None):
320 """Safely get an attribute from the scraper."""
321 try:
322 method = getattr(scraper, attr, None)
323 if callable(method):
324 result = method()
325 return result if result is not None else default
326 return default
327 except Exception:
328 logger.debug("Failed to get %s from scraper", attr, exc_info=True)
329 return default
331 def _safe_get_ingredient_groups(self, scraper) -> list:
332 """Get ingredient groups if available."""
333 try:
334 groups = scraper.ingredient_groups()
335 if groups:
336 return [
337 {"purpose": getattr(g, "purpose", ""), "ingredients": getattr(g, "ingredients", [])} for g in groups
338 ]
339 except Exception:
340 logger.warning("Failed to get ingredient groups from scraper", exc_info=True)
341 return []
343 def _parse_time(self, value) -> int | None:
344 """Parse time value to minutes."""
345 if value is None:
346 return None
347 if isinstance(value, (int, float)):
348 return int(value)
349 if isinstance(value, str):
350 # Try to extract number
351 match = re.search(r"(\d+)", value)
352 if match:
353 return int(match.group(1))
354 return None
356 def _parse_servings(self, yields: str) -> int | None:
357 """Extract serving count from yields string."""
358 if not yields:
359 return None
360 match = re.search(r"(\d+)", yields)
361 if match:
362 return int(match.group(1))
363 return None
365 def _parse_rating(self, value) -> float | None:
366 """Parse rating value to float."""
367 if value is None:
368 return None
369 try:
370 return float(value)
371 except (ValueError, TypeError):
372 return None
374 def _parse_rating_count(self, value) -> int | None:
375 """Parse rating count to int."""
376 if value is None:
377 return None
378 try:
379 return int(value)
380 except (ValueError, TypeError):
381 return None
    async def _download_image(self, image_url: str) -> ContentFile | None:
        """
        Download recipe image and return as ContentFile.

        Validates image URL against SSRF blocklist before fetching.
        Follows redirects manually with per-hop validation (max 5 hops).
        Enforces response size limit (50MB).
        WebP images are converted to JPEG for iOS 9 compatibility.

        Returns None on any failure — the image is optional, so errors are
        logged rather than raised.
        """
        if not image_url:
            return None

        # Validate image URL for SSRF protection (FR-001); the pinned DNS
        # resolution is reused below to prevent TOCTOU rebinding.
        try:
            resolved = validate_url(image_url)
        except ValueError:
            logger.warning("Blocked image URL (SSRF): %s", image_url)
            return None

        # Try each browser fingerprint until one succeeds.
        for profile in BROWSER_PROFILES:
            try:
                content = await self._fetch_image_with_redirects(image_url, profile, resolved.curl_resolve)
                if content is not None:
                    # Normalize the format (WebP -> JPEG, shrink oversized images).
                    content = self._convert_webp_to_jpeg(content)
                    return ContentFile(content)
            except Exception as e:
                logger.warning(
                    "Failed to download image %s with %s: %s",
                    image_url,
                    profile,
                    e,
                )
                continue

        return None
    async def _fetch_image_with_redirects(self, url, profile, curl_resolve=None):
        """Fetch image following redirects with per-hop SSRF validation and DNS pinning.

        Unlike the HTML fetcher, failures return None instead of raising:
        a missing image must never abort a scrape.

        Args:
            url: Image URL to fetch.
            profile: curl_cffi browser-impersonation profile.
            curl_resolve: Pinned DNS entries from validate_url for the first hop.

        Returns:
            Image bytes on success, or None on any problem (bad redirect,
            blocked target, non-image content, oversized, non-200 status).
        """
        from curl_cffi import CurlOpt

        current_url = url
        current_resolve = curl_resolve or []
        for _ in range(MAX_REDIRECT_HOPS):
            # Pin DNS for this hop to prevent TOCTOU rebinding.
            curl_opts = {CurlOpt.RESOLVE: current_resolve} if current_resolve else {}
            async with AsyncSession(impersonate=profile, curl_options=curl_opts) as session:
                response = await session.get(
                    current_url,
                    timeout=self.timeout,
                    allow_redirects=False,
                )

                if response.status_code in (301, 302, 303, 307, 308):
                    location = response.headers.get("location")
                    if not location:
                        return None
                    # Re-validate every redirect target against the SSRF blocklist.
                    try:
                        resolved = validate_redirect_url(location)
                    except ValueError:
                        return None
                    current_url = location
                    current_resolve = resolved.curl_resolve
                    continue

                if response.status_code == 200:
                    # Require an image content-type, or at least an image-looking URL.
                    content_type = response.headers.get("content-type", "")
                    if "image" not in content_type and not self._is_image_url(current_url):
                        return None
                    # Check the declared size first (cheap)...
                    if not check_response_size(response, MAX_IMAGE_SIZE):
                        logger.warning("Image too large: %s", current_url)
                        return None
                    content = response.content
                    # ...then the actual payload, since headers can lie.
                    if len(content) > MAX_IMAGE_SIZE:
                        logger.warning("Image content too large: %s", current_url)
                        return None
                    return content

                # Any other status: give up on this profile.
                return None

        logger.warning("Too many redirects for image: %s", url)
        return None
464 def _convert_webp_to_jpeg(self, content: bytes) -> bytes:
465 """Convert WebP images to JPEG for iOS 9 compatibility.
467 Also resizes very large images to reduce file size.
468 Rejects images that exceed the size limit (decompression bomb protection).
469 """
470 if len(content) > MAX_IMAGE_SIZE:
471 logger.warning("Image content too large for processing: %d bytes", len(content))
472 return content
474 try:
475 img = Image.open(BytesIO(content))
477 # Check if conversion is needed (WebP or very large)
478 needs_conversion = img.format == "WEBP"
479 needs_resize = img.width > 1200 or img.height > 1200
481 if not needs_conversion and not needs_resize:
482 return content
484 # Resize if too large (max 1200px on longest side)
485 if needs_resize:
486 img.thumbnail((1200, 1200), Image.Resampling.LANCZOS)
488 # Convert to RGB if needed (for JPEG)
489 if img.mode in ("RGBA", "P"):
490 img = img.convert("RGB")
492 # Save as JPEG
493 output = BytesIO()
494 img.save(output, format="JPEG", quality=85, optimize=True)
495 logger.info(f"Converted image: {img.format} -> JPEG, resized: {needs_resize}")
496 return output.getvalue()
498 except Exception as e:
499 logger.warning(f"Image conversion failed: {e}, using original")
500 return content
502 def _is_image_url(self, url: str) -> bool:
503 """Check if URL looks like an image."""
504 image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".webp")
505 parsed = urlparse(url)
506 return parsed.path.lower().endswith(image_extensions)
508 def _generate_image_filename(self, recipe_url: str, image_url: str) -> str:
509 """Generate a unique filename for the recipe image.
511 Always uses .jpg extension since images are converted to JPEG
512 for iOS 9 compatibility.
513 """
514 # Create hash from URLs for uniqueness
515 url_hash = hashlib.md5(f"{recipe_url}{image_url}".encode(), usedforsecurity=False).hexdigest()[:12]
517 return f"recipe_{url_hash}.jpg"