Coverage for apps/recipes/services/search.py: 86% (200 statements)
"""
Async multi-site recipe search service.
"""

import asyncio
import logging
import re
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
from urllib.parse import quote_plus, urljoin, urlparse

from asgiref.sync import sync_to_async
from bs4 import BeautifulSoup
from curl_cffi.requests import AsyncSession
from django.utils import timezone

from apps.recipes.services.fingerprint import (
    BROWSER_PROFILES,
    get_random_delay,
)

logger = logging.getLogger(__name__)

@dataclass
class SearchResult:
    """A single search result from a recipe site."""
    url: str
    title: str
    host: str
    image_url: str = ''
    description: str = ''
    rating_count: Optional[int] = None


class RecipeSearch:
    """
    Async recipe search service that queries multiple sites concurrently.

    Uses curl_cffi with browser impersonation to fetch search pages,
    then parses results using BeautifulSoup with site-specific selectors.

    Browser profiles are centralized in fingerprint.py for maintainability.
    """

    MAX_CONCURRENT = 10
    DEFAULT_TIMEOUT = 30

    def __init__(self):
        self.timeout = self.DEFAULT_TIMEOUT

    async def search(
        self,
        query: str,
        sources: Optional[list[str]] = None,
        page: int = 1,
        per_page: int = 20,
    ) -> dict:
        """
        Search for recipes across multiple sites.

        Args:
            query: Search query string
            sources: Optional list of hosts to search (None = all enabled)
            page: Page number (1-indexed)
            per_page: Results per page

        Returns:
            dict with keys:
                - results: List of SearchResult dicts
                - total: Total result count
                - page: Current page
                - has_more: Whether more results exist
                - sites: Dict mapping host to result count
        """
        from apps.recipes.models import SearchSource

        # Get enabled sources
        get_sources = sync_to_async(lambda: list(
            SearchSource.objects.filter(is_enabled=True)
        ))
        enabled_sources = await get_sources()

        # Filter by requested sources if specified
        if sources:
            enabled_sources = [s for s in enabled_sources if s.host in sources]

        if not enabled_sources:
            return {
                'results': [],
                'total': 0,
                'page': page,
                'has_more': False,
                'sites': {},
            }

        # Create semaphore for concurrency control
        semaphore = asyncio.Semaphore(self.MAX_CONCURRENT)

        # Search all sources concurrently with primary browser profile
        # If all sources fail, we try fallback profiles
        primary_profile = BROWSER_PROFILES[0]

        async with AsyncSession(impersonate=primary_profile) as session:
            tasks = [
                self._search_source(session, semaphore, source, query)
                for source in enabled_sources
            ]
            results_by_source = await asyncio.gather(*tasks, return_exceptions=True)

        # Aggregate results
        all_results: list[SearchResult] = []
        site_counts: dict[str, int] = {}

        for source, result in zip(enabled_sources, results_by_source):
            if isinstance(result, Exception):
                logger.warning(f"Search failed for {source.host}: {result}")
                await self._record_failure(source)
                continue

            site_counts[source.host] = len(result)
            all_results.extend(result)
            await self._record_success(source)

        # Deduplicate by URL
        seen_urls = set()
        unique_results = []
        for r in all_results:
            if r.url not in seen_urls:
                seen_urls.add(r.url)
                unique_results.append(r)

        # Convert to dict format for ranking
        result_dicts = [
            {
                'url': r.url,
                'title': r.title,
                'host': r.host,
                'image_url': r.image_url,
                'description': r.description,
                'rating_count': r.rating_count,
            }
            for r in unique_results
        ]

        # Apply AI ranking (optional, skips if unavailable)
        result_dicts = await self._apply_ai_ranking(query, result_dicts)

        # Paginate
        total = len(result_dicts)
        start = (page - 1) * per_page
        end = start + per_page
        paginated = result_dicts[start:end]

        return {
            'results': paginated,
            'total': total,
            'page': page,
            'has_more': end < total,
            'sites': site_counts,
        }
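    # Typical call site (illustrative sketch only, not part of this module):
    # from async view or task code one might do, assuming at least one enabled
    # SearchSource row exists:
    #
    #     searcher = RecipeSearch()
    #     payload = await searcher.search("chicken soup", page=1, per_page=20)
    #     payload["results"]   # list of result dicts (url, title, host, ...)
    #     payload["has_more"]  # True when further pages are available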

    async def _apply_ai_ranking(self, query: str, results: list[dict]) -> list[dict]:
        """Apply AI ranking to search results (non-blocking).

        Skips ranking if AI is unavailable or if it fails.
        """
        try:
            from apps.ai.services.ranking import rank_results
            ranked = await sync_to_async(rank_results)(query, results)
            return ranked
        except Exception as e:
            logger.warning(f'AI ranking failed: {e}')
            return results
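    # The ranking hook above only assumes that rank_results is a synchronous
    # callable taking (query, results) and returning the same result dicts in a
    # new order. A minimal compatible stub, shown for illustration only (the
    # real implementation lives in apps.ai.services.ranking):
    #
    #     def rank_results(query: str, results: list[dict]) -> list[dict]:
    #         terms = query.lower().split()
    #         return sorted(
    #             results,
    #             key=lambda r: -sum(t in r['title'].lower() for t in terms),
    #         )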

    async def _search_source(
        self,
        session: AsyncSession,
        semaphore: asyncio.Semaphore,
        source,
        query: str,
    ) -> list[SearchResult]:
        """
        Search a single source for recipes.

        Uses randomized delays to avoid bot detection patterns.
        """
        async with semaphore:
            # Add randomized delay to avoid predictable request patterns
            await asyncio.sleep(get_random_delay())
            # Build search URL
            search_url = source.search_url_template.replace(
                '{query}',
                quote_plus(query)
            )

            try:
                response = await asyncio.wait_for(
                    session.get(
                        search_url,
                        timeout=self.timeout,
                        allow_redirects=True,
                    ),
                    timeout=self.timeout + 5,  # Extra buffer for asyncio
                )

                if response.status_code != 200:
                    raise Exception(f"HTTP {response.status_code}")

                return self._parse_search_results(
                    response.text,
                    source.host,
                    source.result_selector,
                    search_url,
                )

            except asyncio.TimeoutError:
                raise Exception("Request timed out")
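    # Illustration of the URL building above, assuming a hypothetical source
    # whose search_url_template is 'https://www.example.com/search?q={query}':
    #
    #     quote_plus("chicken soup")  ->  "chicken+soup"
    #     search_url                  ->  "https://www.example.com/search?q=chicken+soup"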

    def _parse_search_results(
        self,
        html: str,
        host: str,
        selector: str,
        base_url: str,
    ) -> list[SearchResult]:
        """
        Parse search results from HTML.

        Uses the site-specific CSS selector if available,
        otherwise falls back to common patterns.
        """
        soup = BeautifulSoup(html, 'html.parser')
        results = []

        # Try site-specific selector first
        if selector:
            elements = soup.select(selector)
            if elements:
                for el in elements[:20]:  # Limit per site
                    result = self._extract_result_from_element(el, host, base_url)
                    if result:
                        results.append(result)
                return results

        # Fallback: Look for common recipe link patterns
        results = self._fallback_parse(soup, host, base_url)
        return results[:20]  # Limit per site
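    # A stored result_selector can be any CSS selector that soup.select()
    # accepts; a hypothetical example would be 'article[class*="recipe-card"]'.
    # If the selector is empty or matches nothing, _fallback_parse() below
    # takes over.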

    def _extract_result_from_element(
        self,
        element,
        host: str,
        base_url: str,
    ) -> Optional[SearchResult]:
        """
        Extract search result data from an HTML element.
        """
        # Find the link
        link = element.find('a', href=True)
        if not link:
            link = element if element.name == 'a' and element.get('href') else None
        if not link:
            return None

        url = link.get('href', '')
        if not url:
            return None

        # Make URL absolute
        url = urljoin(base_url, url)

        # Skip non-recipe URLs
        if not self._looks_like_recipe_url(url, host):
            return None

        # Extract title: prefer heading tags, then class-based title elements.
        # find() only matches tag names, so CSS selectors go through select_one().
        title = ''
        title_el = element.find(['h2', 'h3', 'h4']) or element.select_one('[class*="title"]')
        if title_el:
            title = title_el.get_text(strip=True)
        if not title:
            title = link.get_text(strip=True)
        if not title:
            title = link.get('title', '') or link.get('aria-label', '')

        if not title:
            return None

        # Extract and strip rating count from title (e.g., "Recipe Name1,392Ratings")
        rating_count = None
        rating_match = re.search(r'([\d,]+)\s*[Rr]atings?\s*$', title)
        if rating_match:
            # Extract the number and remove commas
            rating_str = rating_match.group(1).replace(',', '')
            try:
                rating_count = int(rating_str)
                # Remove rating text from title
                title = title[:rating_match.start()].strip()
            except ValueError:
                pass

        # Title may have become empty after stripping rating (QA-053)
        if not title:
            return None

        # Extract image
        image_url = ''
        img = element.find('img')
        if img:
            image_url = img.get('src') or img.get('data-src') or img.get('data-lazy-src', '')
            if image_url:
                image_url = urljoin(base_url, image_url)

        # Extract description (same tag-vs-selector split as the title lookup)
        description = ''
        desc_el = element.find('p') or element.select_one('[class*="description"]')
        if desc_el:
            description = desc_el.get_text(strip=True)[:200]

        return SearchResult(
            url=url,
            title=title[:200],
            host=host,
            image_url=image_url,
            description=description,
            rating_count=rating_count,
        )

    def _fallback_parse(
        self,
        soup: BeautifulSoup,
        host: str,
        base_url: str,
    ) -> list[SearchResult]:
        """
        Fallback parser for sites without a specific selector.

        Looks for common patterns in recipe search results.
        """
        results = []

        # Strategy 1: Look for article elements with links
        for article in soup.find_all('article')[:30]:
            result = self._extract_result_from_element(article, host, base_url)
            if result:
                results.append(result)

        if results:
            return results

        # Strategy 2: Look for card-like divs
        card_selectors = [
            '[class*="recipe-card"]',
            '[class*="card"]',
            '[class*="result"]',
            '[class*="item"]',
        ]
        for selector in card_selectors:
            for card in soup.select(selector)[:30]:
                result = self._extract_result_from_element(card, host, base_url)
                if result:
                    results.append(result)
            if results:
                return results

        # Strategy 3: Look for links that look like recipe URLs
        for link in soup.find_all('a', href=True)[:100]:
            url = urljoin(base_url, link.get('href', ''))
            if self._looks_like_recipe_url(url, host):
                title = link.get_text(strip=True)
                if title and len(title) > 5:
                    results.append(SearchResult(
                        url=url,
                        title=title[:200],
                        host=host,
                    ))

        return results

    def _looks_like_recipe_url(self, url: str, host: str) -> bool:
        """
        Check if a URL looks like a recipe detail page.
        """
        parsed = urlparse(url)

        # Must be from the expected host
        if host not in parsed.netloc:
            return False

        path = parsed.path.lower()

        # Common recipe URL patterns
        recipe_patterns = [
            r'/recipe[s]?/',
            r'/dish/',
            r'/food/',
            r'/cooking/',
            r'/\d+/',  # Numeric ID in path
            r'-recipe/?$',  # URL ending with -recipe
            r'/a\d+/',  # Alphanumeric IDs like /a69912280/
            r'/food-cooking/',  # Pioneer Woman style
        ]

        # Exclude non-recipe paths
        exclude_patterns = [
            r'/search',
            r'/tag/',
            r'/category/',
            r'/author/',
            r'/profile/',
            r'/user/',
            r'/about',
            r'/contact',
            r'/privacy',
            r'/terms',
            r'/newsletter',
            r'/subscribe',
            # Article/blog paths (QA-053)
            r'/article/',
            r'/articles/',
            r'/blog/',
            r'/post/',
            r'/posts/',
            r'/news/',
            r'/story/',
            r'/stories/',
            r'/feature/',
            r'/features/',
            r'/guide/',
            r'/guides/',
            r'/review/',
            r'/reviews/',
            r'/roundup/',
            r'/list/',
            r'/listicle/',
            # Video paths (QA-053)
            r'/video/',
            r'/videos/',
            r'/watch(?:/|$)',  # query strings are not part of path, so match /watch and /watch/
            r'/embed/',
            r'/player/',
            r'/clip/',
            r'/clips/',
            r'/episode/',
            r'/episodes/',
            r'/series/',
            r'/show/',
            r'/shows/',
            r'/gallery/',
            r'/galleries/',
            r'/slideshow/',
            r'/photo-gallery/',
            # Index/listing pages (QA-053)
            r'/seasons?(?:/|$)',
            r'/cuisines?(?:/|$)',
            r'/ingredients?(?:/|$)',
            r'/collections?(?:/|$)',
            r'/occasions?(?:/|$)',
            r'/courses?(?:/|$)',
            r'/diets?(?:/|$)',
            r'/techniques?(?:/|$)',
            r'/chefs?(?:/|$)',
            r'/dishes(?:/|$)',
            r'/menus?(?:/|$)',
            r'/meal-plans?(?:/|$)',
        ]

        for pattern in exclude_patterns:
            if re.search(pattern, path):
                return False

        # Site-specific requirements (QA-058)
        # AllRecipes has article pages at root that look like recipes but aren't
        # Real recipes are always under /recipe/ path
        if 'allrecipes.com' in host and '/recipe/' not in path:
            return False

        # Check for recipe patterns
        for pattern in recipe_patterns:
            if re.search(pattern, path):
                return True

        # Heuristic: URL path has enough segments and isn't too short
        segments = [s for s in path.split('/') if s]
        if len(segments) >= 2 and len(path) > 20:
            return True

        # Also accept single-segment slug-style URLs (common for food blogs)
        # e.g., /30-cloves-garlic-chicken/
        if len(segments) == 1 and len(path) > 15 and path.count('-') >= 2:
            return True

        return False
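    # Examples of how the rules above classify URLs (hosts other than
    # allrecipes.com are illustrative; assume host matches the source's host):
    #
    #     https://www.allrecipes.com/recipe/12345/garlic-chicken/  ->  True   (/recipe/ pattern)
    #     https://www.allrecipes.com/article/kitchen-tips/         ->  False  (excluded, and not under /recipe/)
    #     https://example-food-blog.com/30-cloves-garlic-chicken/  ->  True   (slug heuristic)
    #     https://www.example.com/search?q=chicken                 ->  False  (/search excluded)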

    async def _record_failure(self, source) -> None:
        """Record a search failure for maintenance tracking."""
        from apps.recipes.models import SearchSource

        @sync_to_async
        def update():
            source.consecutive_failures += 1
            if source.consecutive_failures >= 3:
                source.needs_attention = True
            source.save(update_fields=['consecutive_failures', 'needs_attention'])

        await update()

    async def _record_success(self, source) -> None:
        """Record a successful search."""
        from apps.recipes.models import SearchSource

        @sync_to_async
        def update():
            source.consecutive_failures = 0
            source.needs_attention = False
            source.last_validated_at = timezone.now()
            source.save(update_fields=[
                'consecutive_failures',
                'needs_attention',
                'last_validated_at',
            ])

        await update()