Coverage for apps / recipes / services / search_classifiers.py: 100%

61 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-05-02 13:22 +0000

1"""URL- and title-level classifiers used by the recipe search parser. 

2 

3Extracted from `search_parsers.py` to keep that file under the per-file size 

4budget. The classifiers decide whether a candidate result represents a recipe 

5detail page rather than an article, gallery, or category index. 

6""" 

7 

import re
from typing import Callable, Optional
from urllib.parse import urlparse

11 

12 

13# Compiled patterns for looks_like_recipe_url (avoid recompiling per call) 

14_RECIPE_PATTERNS = [ 

15 re.compile(p) 

16 for p in [ 

17 r"/recipe[s]?/", 

18 r"/dish/", 

19 r"/food/", 

20 r"/cooking/", 

21 r"/\d+/", 

22 r"-recipe/?$", 

23 r"/a\d+/", 

24 r"/food-cooking/", 

25 ] 

26] 

27 

28_EXCLUDE_PATTERNS = [ 

29 re.compile(p) 

30 for p in [ 

31 r"/search", 

32 r"/tag/", 

33 r"/category/", 

34 r"/author/", 

35 r"/profile/", 

36 r"/user/", 

37 r"/about", 

38 r"/contact", 

39 r"/privacy", 

40 r"/terms", 

41 r"/newsletter", 

42 r"/subscribe", 

43 # Article/blog paths (QA-053) 

44 r"/article/", 

45 r"/articles/", 

46 r"/blog/", 

47 r"/post/", 

48 r"/posts/", 

49 r"/news/", 

50 r"/story/", 

51 r"/stories/", 

52 r"/feature/", 

53 r"/features/", 

54 r"/guide/", 

55 r"/guides/", 

56 r"/review/", 

57 r"/reviews/", 

58 r"/roundup/", 

59 r"/list/", 

60 r"/listicle/", 

61 # Video paths (QA-053) 

62 r"/video/", 

63 r"/videos/", 

64 r"/watch/", 

65 r"/watch\?", 

66 r"/embed/", 

67 r"/player/", 

68 r"/clip/", 

69 r"/clips/", 

70 r"/episode/", 

71 r"/episodes/", 

72 r"/series/", 

73 r"/show/", 

74 r"/shows/", 

75 r"/gallery/", 

76 r"/galleries/", 

77 r"/slideshow/", 

78 r"/photo-gallery/", 

79 # Index/listing pages (QA-053) 

80 r"/seasons?(?:/|$)", 

81 r"/cuisines?(?:/|$)", 

82 r"/ingredients?(?:/|$)", 

83 r"/collections?(?:/|$)", 

84 r"/occasions?(?:/|$)", 

85 r"/courses?(?:/|$)", 

86 r"/diets?(?:/|$)", 

87 r"/techniques?(?:/|$)", 

88 r"/chefs?(?:/|$)", 

89 r"/dishes(?:/|$)", 

90 r"/menus?(?:/|$)", 

91 r"/meal-plans?(?:/|$)", 

92 ] 

93] 

94 

95 

def _check_exclusion_patterns(path: str) -> bool:
    """Report whether ``path`` hits at least one entry of ``_EXCLUDE_PATTERNS``."""
    for excluded in _EXCLUDE_PATTERNS:
        if excluded.search(path):
            return True
    return False

99 

100 

def _check_recipe_patterns(path: str) -> bool:
    """Report whether ``path`` hits at least one entry of ``_RECIPE_PATTERNS``."""
    for recipe_like in _RECIPE_PATTERNS:
        if recipe_like.search(path):
            return True
    return False

104 

105 

106def _skinnytaste_signal(path: str) -> Optional[str]: 

107 """Skinnytaste publishes recipes at /<slug>/ with no description on the 

108 search page, which the generic neutral-URL filter would reject. Treat any 

109 top-level hyphenated slug as a strong recipe signal; exclusion patterns 

110 above already strip /about/, /category/, etc. 

111 """ 

112 segments = [s for s in path.split("/") if s] 

113 if len(segments) == 1 and len(segments[0]) > 15 and segments[0].count("-") >= 2: 

114 return "strong_include" 

115 return None 

116 

117 

def _allrecipes_signal(path: str) -> Optional[str]:
    """Site rule for allrecipes.com.

    Returns:
        "reject" when ``path`` does not contain ``/recipe/``; otherwise None so
        that the generic checks decide.
    """
    return None if "/recipe/" in path else "reject"


# Site-specific rules: domain -> callable that receives the URL path and
# returns a signal string, or None when the rule has no opinion.
# _check_site_rules applies a rule to the domain itself and to its subdomains.
_SITE_RULES: dict[str, Callable[[str], Optional[str]]] = {
    "allrecipes.com": _allrecipes_signal,
    "skinnytaste.com": _skinnytaste_signal,
}

123 

124 

def _check_site_rules(host: str, path: str) -> Optional[str]:
    """Run the first site-specific rule whose domain covers ``host``.

    A rule covers ``host`` when the host equals the rule's domain or is a
    subdomain of it. Only the first matching entry of ``_SITE_RULES`` is
    consulted.

    Returns:
        Whatever the matching rule returns for ``path`` (a signal string or
        None), or None when no rule covers ``host``.
    """
    matching_rule = next(
        (
            rule
            for domain, rule in _SITE_RULES.items()
            if host == domain or host.endswith("." + domain)
        ),
        None,
    )
    if matching_rule is None:
        return None
    return matching_rule(path)

131 

132 

133def _check_path_heuristics(path: str) -> str: 

134 """Apply heuristic fallbacks for paths with no strong signal. 

135 

136 Returns "neutral" or "reject". 

137 """ 

138 segments = [s for s in path.split("/") if s] 

139 if len(segments) >= 2 and len(path) > 20: 

140 return "neutral" 

141 if len(segments) == 1 and len(path) > 15 and path.count("-") >= 2: 

142 return "neutral" 

143 return "reject" 

144 

145 

def _url_host_matches(parsed, host: str) -> bool:
    """Return True if the parsed URL belongs to ``host`` or one of its subdomains.

    Args:
        parsed: Result of ``urllib.parse.urlparse``.
        host: Expected host, e.g. "allrecipes.com". Compared case-insensitively.
    """
    expected = host.lower()
    if not expected:
        # An empty host placed no constraint under the previous substring
        # test either; keep that so such callers are unaffected.
        return True

    # ``hostname`` is the lowercased host with userinfo and port removed, so
    # "notallrecipes.com" or "allrecipes.com.evil.example" can no longer pass
    # for "allrecipes.com" the way a plain substring test allowed.
    hostname = parsed.hostname or ""
    if hostname == expected or hostname.endswith(f".{expected}"):
        return True

    # Still accept a ``host`` that was given together with a port.
    return parsed.netloc.lower() == expected


def get_url_signal(url: str, host: str) -> str:
    """Determine URL signal strength for recipe filtering.

    Checks run in this order and the first one that decides wins: host match,
    exclusion patterns, site-specific rules, recipe patterns, and finally the
    path heuristics. All path checks see the lowercased URL path.

    Args:
        url: Candidate result URL.
        host: Host the result is expected to come from; the URL must be on
            this host or on a subdomain of it.

    Returns:
        "strong_exclude" - URL matches exclusion patterns (articles, blogs, etc.)
        "strong_include" - URL matches recipe patterns (/recipe/, /recipes/, etc.)
            or a site-specific rule says so
        "neutral" - URL passes heuristics but has no strong signal
        "reject" - URL fails all checks (wrong host, too short, etc.) or a
            site-specific rule rejects it
    """
    parsed = urlparse(url)

    if not _url_host_matches(parsed, host):
        return "reject"

    path = parsed.path.lower()

    if _check_exclusion_patterns(path):
        return "strong_exclude"

    # Site rules run before the generic recipe patterns so that a site can
    # reject or accept a path the generic patterns would judge differently.
    site_signal = _check_site_rules(host, path)
    if site_signal is not None:
        return site_signal

    if _check_recipe_patterns(path):
        return "strong_include"

    return _check_path_heuristics(path)

173 

174 

def looks_like_recipe_url(url: str, host: str) -> bool:
    """Tell whether ``url`` plausibly points at a recipe detail page on ``host``.

    True when get_url_signal classifies the URL as "strong_include" or
    "neutral"; any other signal gives False.
    """
    verdict = get_url_signal(url, host)
    return verdict == "strong_include" or verdict == "neutral"

179 

180 

181# Strong editorial patterns — always reject even if recipe words present 

182# These are clearly article/editorial headlines, not recipe titles 

183_STRONG_EDITORIAL_PATTERNS = [ 

184 re.compile(p, re.IGNORECASE) 

185 for p in [ 

186 r"\bdeserves?\s+a\s+(?:gold|silver|bronze)\s+medal\b", 

187 r"\bis\s+a\s+weeknight\s+winner\b", 

188 r"\btop\s+trending\s+recipe\s+of\s+\d{4}\b", 

189 r"\binsanely\s+awesome\b", 

190 r"\bmost\s+beautiful\s+destination\b", 

191 r"\bbest\s+time\s+to\s+book\b", 

192 ] 

193] 

194 

195# Mild editorial patterns — rejected unless recipe-context words present 

196_EDITORIAL_TITLE_PATTERNS = [ 

197 re.compile(p, re.IGNORECASE) 

198 for p in [ 

199 # Listicles: "Top 10...", "5 Best...", "7 Reasons..." 

200 r"^(?:the\s+)?(?:top\s+)?\d+\s+(?:best|worst|things|reasons|ways|places|tips|tricks|destinations|restaurants|spots|cities|towns)\b", 

201 # Travel/destination content 

202 r"\btravel\s+guide\b", 

203 r"\bbest\s+destinations?\b", 

204 r"\bplaces?\s+to\s+visit\b", 

205 r"\bwhere\s+to\s+(?:eat|go|stay|travel)\b", 

206 r"\bbook\s+(?:your\s+)?(?:thanksgiving|christmas|holiday)\s+travel\b", 

207 # Review/editorial 

208 r"^review\s*:", 

209 r"\b(?:product|book|restaurant|movie|hotel|app)\s+review\b", 

210 # News/trending headers 

211 r"^(?:news|breaking|update|trending)\s*:", 

212 # Meta/navigation pages 

213 r"^(?:about\s+us|contact\s+us|privacy\s+policy|terms\s+of|cookie\s+policy|subscribe|newsletter|sign\s+up|log\s+in)\b", 

214 ] 

215] 

216 

217# Recipe-context words that override mild editorial title patterns 

218_RECIPE_CONTEXT_PATTERN = re.compile( 

219 r"\b(?:recipe[s]?|cook(?:ing|ed)?|bake[ds]?|baking|roast(?:ed|ing)?|" 

220 r"grill(?:ed|ing)?|how\s+to\s+(?:make|cook|bake|prepare)|homemade|" 

221 r"ingredient[s]?|from\s+scratch|step.by.step|easy\s+(?:to\s+)?make)\b", 

222 re.IGNORECASE, 

223) 

224 

225 

def looks_like_recipe_title(title: str, url_signal: str) -> bool:
    """Decide whether a search result title reads like recipe content.

    Resolution is tiered on the URL signal strength:
    - A "strong_include" URL passes outright; the title is not inspected.
    - For any other signal the stripped title is evaluated:
      - an empty title fails;
      - a strong editorial pattern always fails, even with recipe words;
      - a mild editorial pattern fails unless recipe-context words are present;
      - a title matching neither pattern set passes.
    """
    if url_signal == "strong_include":
        return True

    headline = title.strip()
    if not headline:
        return False

    # Strong editorial phrasing is decisive on its own.
    if any(rx.search(headline) for rx in _STRONG_EDITORIAL_PATTERNS):
        return False

    # Mild editorial phrasing can be redeemed by recipe-context words.
    if any(rx.search(headline) for rx in _EDITORIAL_TITLE_PATTERNS):
        return _RECIPE_CONTEXT_PATTERN.search(headline) is not None

    return True

← Back to Dashboard