Coverage for apps / recipes / services / search_classifiers.py: 100%
61 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 13:22 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-02 13:22 +0000
"""URL- and title-level classifiers used by the recipe search parser.

Extracted from `search_parsers.py` to keep that file under the per-file size
budget. The classifiers decide whether a candidate result represents a recipe
detail page rather than an article, gallery, or category index.
"""
import re
from typing import Callable, Optional
from urllib.parse import urlparse
13# Compiled patterns for looks_like_recipe_url (avoid recompiling per call)
14_RECIPE_PATTERNS = [
15 re.compile(p)
16 for p in [
17 r"/recipe[s]?/",
18 r"/dish/",
19 r"/food/",
20 r"/cooking/",
21 r"/\d+/",
22 r"-recipe/?$",
23 r"/a\d+/",
24 r"/food-cooking/",
25 ]
26]
28_EXCLUDE_PATTERNS = [
29 re.compile(p)
30 for p in [
31 r"/search",
32 r"/tag/",
33 r"/category/",
34 r"/author/",
35 r"/profile/",
36 r"/user/",
37 r"/about",
38 r"/contact",
39 r"/privacy",
40 r"/terms",
41 r"/newsletter",
42 r"/subscribe",
43 # Article/blog paths (QA-053)
44 r"/article/",
45 r"/articles/",
46 r"/blog/",
47 r"/post/",
48 r"/posts/",
49 r"/news/",
50 r"/story/",
51 r"/stories/",
52 r"/feature/",
53 r"/features/",
54 r"/guide/",
55 r"/guides/",
56 r"/review/",
57 r"/reviews/",
58 r"/roundup/",
59 r"/list/",
60 r"/listicle/",
61 # Video paths (QA-053)
62 r"/video/",
63 r"/videos/",
64 r"/watch/",
65 r"/watch\?",
66 r"/embed/",
67 r"/player/",
68 r"/clip/",
69 r"/clips/",
70 r"/episode/",
71 r"/episodes/",
72 r"/series/",
73 r"/show/",
74 r"/shows/",
75 r"/gallery/",
76 r"/galleries/",
77 r"/slideshow/",
78 r"/photo-gallery/",
79 # Index/listing pages (QA-053)
80 r"/seasons?(?:/|$)",
81 r"/cuisines?(?:/|$)",
82 r"/ingredients?(?:/|$)",
83 r"/collections?(?:/|$)",
84 r"/occasions?(?:/|$)",
85 r"/courses?(?:/|$)",
86 r"/diets?(?:/|$)",
87 r"/techniques?(?:/|$)",
88 r"/chefs?(?:/|$)",
89 r"/dishes(?:/|$)",
90 r"/menus?(?:/|$)",
91 r"/meal-plans?(?:/|$)",
92 ]
93]
def _check_exclusion_patterns(path: str) -> bool:
    """Report whether any entry of _EXCLUDE_PATTERNS is found in ``path``."""
    for excluded in _EXCLUDE_PATTERNS:
        if excluded.search(path):
            return True
    return False
def _check_recipe_patterns(path: str) -> bool:
    """Report whether any entry of _RECIPE_PATTERNS is found in ``path``."""
    first_hit = next(
        (rx for rx in _RECIPE_PATTERNS if rx.search(path)),
        None,
    )
    return first_hit is not None
106def _skinnytaste_signal(path: str) -> Optional[str]:
107 """Skinnytaste publishes recipes at /<slug>/ with no description on the
108 search page, which the generic neutral-URL filter would reject. Treat any
109 top-level hyphenated slug as a strong recipe signal; exclusion patterns
110 above already strip /about/, /category/, etc.
111 """
112 segments = [s for s in path.split("/") if s]
113 if len(segments) == 1 and len(segments[0]) > 15 and segments[0].count("-") >= 2:
114 return "strong_include"
115 return None
def _allrecipes_signal(path: str) -> Optional[str]:
    """Site rule for Allrecipes: reject any path without a /recipe/ segment.

    Returns "reject" when "/recipe/" does not occur in ``path``; otherwise
    None, so the generic checks decide.
    """
    return None if "/recipe/" in path else "reject"


# Site-specific rules: host → callable that takes the URL path and returns a
# signal string, or None to fall through to the generic checks.
_SITE_RULES: dict[str, Callable[[str], Optional[str]]] = {
    "allrecipes.com": _allrecipes_signal,
    "skinnytaste.com": _skinnytaste_signal,
}
def _check_site_rules(host: str, path: str) -> Optional[str]:
    """Run the first site-specific rule whose domain covers ``host``.

    A domain covers ``host`` when they are equal or when ``host`` ends with
    "." followed by the domain (i.e. it is a subdomain). Only the first
    matching entry of _SITE_RULES is consulted; its result, which may itself
    be None, is returned. With no matching entry the result is None.
    """
    matching_rule = next(
        (
            rule
            for domain, rule in _SITE_RULES.items()
            if host == domain or host.endswith("." + domain)
        ),
        None,
    )
    if matching_rule is None:
        return None
    return matching_rule(path)
133def _check_path_heuristics(path: str) -> str:
134 """Apply heuristic fallbacks for paths with no strong signal.
136 Returns "neutral" or "reject".
137 """
138 segments = [s for s in path.split("/") if s]
139 if len(segments) >= 2 and len(path) > 20:
140 return "neutral"
141 if len(segments) == 1 and len(path) > 15 and path.count("-") >= 2:
142 return "neutral"
143 return "reject"
def _host_matches(url_host: str, expected_host: str) -> bool:
    """Return True if url_host equals expected_host or is a subdomain of it."""
    return url_host == expected_host or url_host.endswith(f".{expected_host}")


def get_url_signal(url: str, host: str) -> str:
    """Determine URL signal strength for recipe filtering.

    The URL's hostname must equal ``host`` or be a subdomain of it. The
    comparison is case-insensitive and ignores userinfo, port and a trailing
    dot, so look-alike hosts such as "evil<host>" or "<host>.evil.example"
    are rejected. An empty ``host`` skips the host check.

    Checks then run in this order, and the first one that decides wins:
    exclusion patterns, site-specific rules, recipe patterns, and finally
    the path heuristics. All of them see the lowercased URL path.

    Args:
        url: Candidate result URL.
        host: Host the result is expected to belong to (e.g. "allrecipes.com").

    Returns:
        "strong_exclude" - URL matches exclusion patterns (articles, blogs, etc.)
        "strong_include" - URL matches recipe patterns (/recipe/, /recipes/, etc.)
        "neutral" - URL passes heuristics but has no strong signal
        "reject" - URL fails all checks (wrong host, too short, malformed, etc.)
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises on a malformed netloc (e.g. an unbalanced "[").
        # Such a URL cannot be a usable recipe link.
        return "reject"

    expected_host = host.strip().lower().rstrip(".")
    # parsed.hostname is already lowercased and has userinfo/port stripped.
    url_host = (parsed.hostname or "").rstrip(".")
    if expected_host and not _host_matches(url_host, expected_host):
        return "reject"

    path = parsed.path.lower()

    if _check_exclusion_patterns(path):
        return "strong_exclude"

    site_signal = _check_site_rules(expected_host, path)
    if site_signal is not None:
        return site_signal

    if _check_recipe_patterns(path):
        return "strong_include"

    return _check_path_heuristics(path)
def looks_like_recipe_url(url: str, host: str) -> bool:
    """Return True when get_url_signal rates the URL "strong_include" or "neutral"."""
    return get_url_signal(url, host) in {"strong_include", "neutral"}
181# Strong editorial patterns — always reject even if recipe words present
182# These are clearly article/editorial headlines, not recipe titles
183_STRONG_EDITORIAL_PATTERNS = [
184 re.compile(p, re.IGNORECASE)
185 for p in [
186 r"\bdeserves?\s+a\s+(?:gold|silver|bronze)\s+medal\b",
187 r"\bis\s+a\s+weeknight\s+winner\b",
188 r"\btop\s+trending\s+recipe\s+of\s+\d{4}\b",
189 r"\binsanely\s+awesome\b",
190 r"\bmost\s+beautiful\s+destination\b",
191 r"\bbest\s+time\s+to\s+book\b",
192 ]
193]
195# Mild editorial patterns — rejected unless recipe-context words present
196_EDITORIAL_TITLE_PATTERNS = [
197 re.compile(p, re.IGNORECASE)
198 for p in [
199 # Listicles: "Top 10...", "5 Best...", "7 Reasons..."
200 r"^(?:the\s+)?(?:top\s+)?\d+\s+(?:best|worst|things|reasons|ways|places|tips|tricks|destinations|restaurants|spots|cities|towns)\b",
201 # Travel/destination content
202 r"\btravel\s+guide\b",
203 r"\bbest\s+destinations?\b",
204 r"\bplaces?\s+to\s+visit\b",
205 r"\bwhere\s+to\s+(?:eat|go|stay|travel)\b",
206 r"\bbook\s+(?:your\s+)?(?:thanksgiving|christmas|holiday)\s+travel\b",
207 # Review/editorial
208 r"^review\s*:",
209 r"\b(?:product|book|restaurant|movie|hotel|app)\s+review\b",
210 # News/trending headers
211 r"^(?:news|breaking|update|trending)\s*:",
212 # Meta/navigation pages
213 r"^(?:about\s+us|contact\s+us|privacy\s+policy|terms\s+of|cookie\s+policy|subscribe|newsletter|sign\s+up|log\s+in)\b",
214 ]
215]
217# Recipe-context words that override mild editorial title patterns
218_RECIPE_CONTEXT_PATTERN = re.compile(
219 r"\b(?:recipe[s]?|cook(?:ing|ed)?|bake[ds]?|baking|roast(?:ed|ing)?|"
220 r"grill(?:ed|ing)?|how\s+to\s+(?:make|cook|bake|prepare)|homemade|"
221 r"ingredient[s]?|from\s+scratch|step.by.step|easy\s+(?:to\s+)?make)\b",
222 re.IGNORECASE,
223)
def looks_like_recipe_title(title: str, url_signal: str) -> bool:
    """Decide whether a search-result title reads like recipe content.

    Resolution is tiered on the URL signal and the title text:
    - a "strong_include" URL signal passes immediately, whatever the title
    - a blank (empty or whitespace-only) title fails
    - a strong editorial pattern fails, even when recipe words are present
    - a mild editorial pattern fails unless recipe-context words are present
    - any other title passes
    """
    if url_signal == "strong_include":
        return True

    headline = title.strip()
    if not headline:
        return False

    # Strong editorial phrasing is disqualifying on its own.
    if any(rx.search(headline) for rx in _STRONG_EDITORIAL_PATTERNS):
        return False

    # Without a mild editorial match there is nothing left to object to.
    if not any(rx.search(headline) for rx in _EDITORIAL_TITLE_PATTERNS):
        return True

    # Mild editorial phrasing survives only alongside recipe vocabulary.
    return _RECIPE_CONTEXT_PATTERN.search(headline) is not None