Skip to content

Commit 96dd84a

Browse files
committed
For short strings, it is more accurate to use all ngrams rather than a sample
1 parent a1598f1 commit 96dd84a

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

langdetect/detector.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,17 @@ def _detect_block(self):
149 149
if not ngrams:
150 150
raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')
151 151

152+
if len(ngrams) < self.ITERATION_LIMIT / 3:
153+
# More accurate to take them all
154+
prob = self._init_probability()
155+
for i, ngram in enumerate(ngrams):
156+
self._update_lang_prob(prob, ngram, self.alpha)
157+
if i % 5 == 0:
158+
self._normalize_prob(prob)
159+
self._normalize_prob(prob)
160+
self.langprob = prob
161+
return
162+
152 163
self.langprob = [0.0] * len(self.langlist)
153 164

154 165
self.random.seed(self.seed)

0 commit comments

Comments
 (0)