Coverage for apps / recipes / services / search.py: 86%
215 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-14 19:13 +0000
1"""
2Async multi-site recipe search service.
3"""
5import asyncio
6import logging
7import re
8from dataclasses import dataclass
9from datetime import datetime
10from typing import Optional
11from urllib.parse import quote_plus, urljoin, urlparse
13from asgiref.sync import sync_to_async
14from bs4 import BeautifulSoup
15from curl_cffi.requests import AsyncSession
16from django.utils import timezone
18from apps.recipes.services.fingerprint import (
19 BROWSER_PROFILES,
20 get_random_delay,
21)
23logger = logging.getLogger(__name__)
@dataclass
class SearchResult:
    """A single search result from a recipe site."""

    # Absolute URL of the recipe detail page (deduplication key in search()).
    url: str
    # Recipe title; the parser truncates this to 200 characters.
    title: str
    # Host of the source site the result came from, e.g. "allrecipes.com".
    host: str
    # Absolute thumbnail URL, or "" when no image was found.
    image_url: str = ""
    # Short plain-text description (at most 200 characters), or "".
    description: str = ""
    # Rating count stripped out of titles like "Name1,392Ratings", if present.
    rating_count: Optional[int] = None
class RecipeSearch:
    """
    Async recipe search service that queries multiple sites concurrently.

    Uses curl_cffi with browser impersonation to fetch search pages,
    then parses results using BeautifulSoup with site-specific selectors.

    Browser profiles are centralized in fingerprint.py for maintainability.
    """

    MAX_CONCURRENT = 10  # max simultaneous site fetches
    DEFAULT_TIMEOUT = 30  # per-request timeout, in seconds

    def __init__(self):
        self.timeout = self.DEFAULT_TIMEOUT

    async def search(
        self,
        query: str,
        sources: Optional[list[str]] = None,
        page: int = 1,
        per_page: int = 20,
    ) -> dict:
        """
        Search for recipes across multiple sites.

        Args:
            query: Search query string
            sources: Optional list of hosts to search (None = all enabled)
            page: Page number (1-indexed)
            per_page: Results per page

        Returns:
            dict with keys:
                - results: List of SearchResult dicts
                - total: Total result count
                - page: Current page
                - has_more: Whether more results exist
                - sites: Dict mapping host to result count
        """
        from apps.recipes.models import SearchSource

        # Get enabled sources (ORM access wrapped for the async context)
        get_sources = sync_to_async(lambda: list(SearchSource.objects.filter(is_enabled=True)))
        enabled_sources = await get_sources()

        # Filter by requested sources if specified
        if sources:
            enabled_sources = [s for s in enabled_sources if s.host in sources]

        if not enabled_sources:
            return {
                "results": [],
                "total": 0,
                "page": page,
                "has_more": False,
                "sites": {},
            }

        # Semaphore caps how many sites are being fetched at once
        semaphore = asyncio.Semaphore(self.MAX_CONCURRENT)

        # Search all sources concurrently with the primary browser profile
        primary_profile = BROWSER_PROFILES[0]

        async with AsyncSession(impersonate=primary_profile) as session:
            tasks = [self._search_source(session, semaphore, source, query) for source in enabled_sources]
            # return_exceptions=True so one failing site cannot sink the batch
            results_by_source = await asyncio.gather(*tasks, return_exceptions=True)

        # Aggregate results, recording per-source health for maintenance
        all_results: list[SearchResult] = []
        site_counts: dict[str, int] = {}

        for source, result in zip(enabled_sources, results_by_source):
            if isinstance(result, Exception):
                logger.warning(f"Search failed for {source.host}: {result}")
                await self._record_failure(source)
                continue

            site_counts[source.host] = len(result)
            all_results.extend(result)
            await self._record_success(source)

        # Deduplicate by URL; first occurrence wins
        seen_urls = set()
        unique_results = []
        for r in all_results:
            if r.url not in seen_urls:
                seen_urls.add(r.url)
                unique_results.append(r)

        # Convert to dict format for ranking
        result_dicts = [
            {
                "url": r.url,
                "title": r.title,
                "host": r.host,
                "image_url": r.image_url,
                "description": r.description,
                "rating_count": r.rating_count,
            }
            for r in unique_results
        ]

        # Apply AI ranking (optional, skips if unavailable)
        result_dicts = await self._apply_ai_ranking(query, result_dicts)

        # Paginate
        total = len(result_dicts)
        start = (page - 1) * per_page
        end = start + per_page
        paginated = result_dicts[start:end]

        return {
            "results": paginated,
            "total": total,
            "page": page,
            "has_more": end < total,
            "sites": site_counts,
        }

    async def _apply_ai_ranking(self, query: str, results: list[dict]) -> list[dict]:
        """Apply AI ranking to search results (non-blocking).

        Skips ranking if AI is unavailable or if it fails; the original
        ordering is returned unchanged in that case.
        """
        try:
            from apps.ai.services.ranking import rank_results

            ranked = await sync_to_async(rank_results)(query, results)
            return ranked
        except Exception as e:
            logger.warning(f"AI ranking failed: {e}")
            return results

    async def _search_source(
        self,
        session: AsyncSession,
        semaphore: asyncio.Semaphore,
        source,
        query: str,
    ) -> list[SearchResult]:
        """
        Search a single source for recipes.

        Uses randomized delays to avoid bot detection patterns.

        Raises:
            Exception: on non-200 responses or timeouts; search() catches
                these via gather(return_exceptions=True).
        """
        async with semaphore:
            # Add randomized delay to avoid predictable request patterns
            await asyncio.sleep(get_random_delay())

            # Build search URL from the source's template
            search_url = source.search_url_template.replace("{query}", quote_plus(query))

            try:
                response = await asyncio.wait_for(
                    session.get(
                        search_url,
                        timeout=self.timeout,
                        allow_redirects=True,
                    ),
                    timeout=self.timeout + 5,  # Extra buffer for asyncio
                )

                if response.status_code != 200:
                    raise Exception(f"HTTP {response.status_code}")

                return self._parse_search_results(
                    response.text,
                    source.host,
                    source.result_selector,
                    search_url,
                )

            except asyncio.TimeoutError:
                raise Exception("Request timed out")

    def _parse_search_results(
        self,
        html: str,
        host: str,
        selector: str,
        base_url: str,
    ) -> list[SearchResult]:
        """
        Parse search results from HTML.

        Uses the site-specific CSS selector if available,
        otherwise falls back to common patterns.
        """
        soup = BeautifulSoup(html, "html.parser")
        results = []

        # Try site-specific selector first
        if selector:
            elements = soup.select(selector)
            if elements:
                for el in elements[:20]:  # Limit per site
                    result = self._extract_result_from_element(el, host, base_url)
                    if result:
                        results.append(result)
                return results

        # Fallback: Look for common recipe link patterns
        results = self._fallback_parse(soup, host, base_url)
        return results[:20]  # Limit per site

    def _find_link(self, element) -> Optional[tuple]:
        """Find recipe link in an HTML element.

        Returns:
            Tuple of (link_element, url) if found, None otherwise.
        """
        link = element.find("a", href=True)
        if not link:
            # The element itself may be the anchor
            link = element if element.name == "a" and element.get("href") else None
        if not link:
            return None

        url = link.get("href", "")
        if not url:
            return None

        return link, url

    def _extract_title(self, element, link) -> str:
        """Extract title from element with multiple fallback strategies.

        Tries: heading/title-classed elements, link text,
        then title/aria-label attributes.
        """
        # NOTE: BeautifulSoup's find() only matches tag *names*, so CSS
        # selectors like '.title' / '[class*="title"]' were silently dead
        # when passed in a find() list; select_one() honors them.
        title_el = element.select_one('h2, h3, h4, .title, [class*="title"]')
        if title_el:
            title = title_el.get_text(strip=True)
            if title:
                return title

        title = link.get_text(strip=True)
        if title:
            return title

        return link.get("title", "") or link.get("aria-label", "")

    def _extract_rating(self, title: str) -> tuple[str, Optional[int]]:
        """Extract and strip rating count from title.

        Handles patterns like "Recipe Name1,392Ratings".

        Returns:
            Tuple of (cleaned_title, rating_count).
        """
        rating_match = re.search(r"([\d,]+)\s*[Rr]atings?\s*$", title)
        if not rating_match:
            return title, None

        rating_str = rating_match.group(1).replace(",", "")
        try:
            rating_count = int(rating_str)
            cleaned_title = title[: rating_match.start()].strip()
            return cleaned_title, rating_count
        except ValueError:
            return title, None

    def _extract_image(self, element, base_url: str) -> str:
        """Extract image URL with multiple fallback strategies.

        Tries: src, data-src, data-lazy-src attributes.
        """
        img = element.find("img")
        if not img:
            return ""

        image_url = img.get("src") or img.get("data-src") or img.get("data-lazy-src", "")
        if image_url:
            return urljoin(base_url, image_url)
        return ""

    def _extract_description(self, element) -> str:
        """Extract a description (truncated to 200 chars) from element."""
        # select_one() so the class-based selectors actually apply; find()
        # would treat '.description' as a (nonexistent) tag name.
        desc_el = element.select_one('p, .description, [class*="description"]')
        if desc_el:
            return desc_el.get_text(strip=True)[:200]
        return ""

    def _extract_result_from_element(
        self,
        element,
        host: str,
        base_url: str,
    ) -> Optional[SearchResult]:
        """Extract search result data from an HTML element."""
        # Find and validate link
        link_result = self._find_link(element)
        if not link_result:
            return None
        link, url = link_result

        # Make URL absolute and validate
        url = urljoin(base_url, url)
        if not self._looks_like_recipe_url(url, host):
            return None

        # Extract title
        title = self._extract_title(element, link)
        if not title:
            return None

        # Extract and strip rating from title
        title, rating_count = self._extract_rating(title)

        # Title may have become empty after stripping rating (QA-053)
        if not title:
            return None

        return SearchResult(
            url=url,
            title=title[:200],
            host=host,
            image_url=self._extract_image(element, base_url),
            description=self._extract_description(element),
            rating_count=rating_count,
        )

    def _fallback_parse(
        self,
        soup: BeautifulSoup,
        host: str,
        base_url: str,
    ) -> list[SearchResult]:
        """
        Fallback parser for sites without a specific selector.

        Looks for common patterns in recipe search results.
        """
        results = []

        # Strategy 1: Look for article elements with links
        for article in soup.find_all("article")[:30]:
            result = self._extract_result_from_element(article, host, base_url)
            if result:
                results.append(result)

        if results:
            return results

        # Strategy 2: Look for card-like divs
        card_selectors = [
            '[class*="recipe-card"]',
            '[class*="card"]',
            '[class*="result"]',
            '[class*="item"]',
        ]
        for selector in card_selectors:
            for card in soup.select(selector)[:30]:
                result = self._extract_result_from_element(card, host, base_url)
                if result:
                    results.append(result)
            if results:
                return results

        # Strategy 3: Look for links that look like recipe URLs
        for link in soup.find_all("a", href=True)[:100]:
            url = urljoin(base_url, link.get("href", ""))
            if self._looks_like_recipe_url(url, host):
                title = link.get_text(strip=True)
                if title and len(title) > 5:
                    results.append(
                        SearchResult(
                            url=url,
                            title=title[:200],
                            host=host,
                        )
                    )

        return results

    def _looks_like_recipe_url(self, url: str, host: str) -> bool:
        """
        Check if a URL looks like a recipe detail page.

        A URL qualifies when it belongs to the expected host, avoids the
        known non-recipe paths, and either matches a recipe URL pattern or
        passes the slug-length heuristics at the end.
        """
        parsed = urlparse(url)

        # Must be from the expected host: exact match or a subdomain.
        # A plain substring test would also accept spoofed hosts such as
        # "allrecipes.com.evil.example" or "notallrecipes.com".
        netloc = parsed.netloc.lower()
        expected_host = host.lower()
        if netloc != expected_host and not netloc.endswith("." + expected_host):
            return False

        path = parsed.path.lower()

        # Common recipe URL patterns
        recipe_patterns = [
            r"/recipe[s]?/",
            r"/dish/",
            r"/food/",
            r"/cooking/",
            r"/\d+/",  # Numeric ID in path
            r"-recipe/?$",  # URL ending with -recipe
            r"/a\d+/",  # Alphanumeric IDs like /a69912280/
            r"/food-cooking/",  # Pioneer Woman style
        ]

        # Exclude non-recipe paths
        exclude_patterns = [
            r"/search",
            r"/tag/",
            r"/category/",
            r"/author/",
            r"/profile/",
            r"/user/",
            r"/about",
            r"/contact",
            r"/privacy",
            r"/terms",
            r"/newsletter",
            r"/subscribe",
            # Article/blog paths (QA-053)
            r"/article/",
            r"/articles/",
            r"/blog/",
            r"/post/",
            r"/posts/",
            r"/news/",
            r"/story/",
            r"/stories/",
            r"/feature/",
            r"/features/",
            r"/guide/",
            r"/guides/",
            r"/review/",
            r"/reviews/",
            r"/roundup/",
            r"/list/",
            r"/listicle/",
            # Video paths (QA-053)
            r"/video/",
            r"/videos/",
            r"/watch/",
            r"/watch\?",
            r"/embed/",
            r"/player/",
            r"/clip/",
            r"/clips/",
            r"/episode/",
            r"/episodes/",
            r"/series/",
            r"/show/",
            r"/shows/",
            r"/gallery/",
            r"/galleries/",
            r"/slideshow/",
            r"/photo-gallery/",
            # Index/listing pages (QA-053)
            r"/seasons?(?:/|$)",
            r"/cuisines?(?:/|$)",
            r"/ingredients?(?:/|$)",
            r"/collections?(?:/|$)",
            r"/occasions?(?:/|$)",
            r"/courses?(?:/|$)",
            r"/diets?(?:/|$)",
            r"/techniques?(?:/|$)",
            r"/chefs?(?:/|$)",
            r"/dishes(?:/|$)",
            r"/menus?(?:/|$)",
            r"/meal-plans?(?:/|$)",
        ]

        # urlparse() strips the query string from .path, so a pattern like
        # r"/watch\?" could never fire against the bare path. When a query
        # string exists, also test "<path>?" so such patterns work.
        exclusion_targets = [path]
        if parsed.query:
            exclusion_targets.append(path + "?")

        for pattern in exclude_patterns:
            if any(re.search(pattern, target) for target in exclusion_targets):
                return False

        # Site-specific requirements (QA-058)
        # AllRecipes has article pages at root that look like recipes but aren't
        # Real recipes are always under /recipe/ path
        if "allrecipes.com" in host and "/recipe/" not in path:
            return False

        # Check for recipe patterns
        for pattern in recipe_patterns:
            if re.search(pattern, path):
                return True

        # Heuristic: URL path has enough segments and isn't too short
        segments = [s for s in path.split("/") if s]
        if len(segments) >= 2 and len(path) > 20:
            return True

        # Also accept single-segment slug-style URLs (common for food blogs)
        # e.g., /30-cloves-garlic-chicken/
        if len(segments) == 1 and len(path) > 15 and path.count("-") >= 2:
            return True

        return False

    async def _record_failure(self, source) -> None:
        """Record a search failure for maintenance tracking.

        Three consecutive failures flag the source for human attention.
        """

        @sync_to_async
        def update():
            source.consecutive_failures += 1
            if source.consecutive_failures >= 3:
                source.needs_attention = True
            source.save(update_fields=["consecutive_failures", "needs_attention"])

        await update()

    async def _record_success(self, source) -> None:
        """Record a successful search, clearing any failure flags."""

        @sync_to_async
        def update():
            source.consecutive_failures = 0
            source.needs_attention = False
            source.last_validated_at = timezone.now()
            source.save(
                update_fields=[
                    "consecutive_failures",
                    "needs_attention",
                    "last_validated_at",
                ]
            )

        await update()