@@ -46,7 +46,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
4646 "smtputf8" : False ,
4747 }
4848
49- # RFC 5321 4.5.3.1.1
49+ # Check the length of the local part by counting characters.
50+ # (RFC 5321 4.5.3.1.1)
5051 # We're checking the number of characters here. If the local part
5152 # is ASCII-only, then that's the same as bytes (octets). If it's
5253 # internationalized, then the UTF-8 encoding may be longer, but
@@ -57,6 +58,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
5758 raise EmailSyntaxError ("The email address is too long before the @-sign {}." .format (reason ))
5859
5960 # Check for invalid characters.
61+ # (RFC 2822 Section 3.2.4 / RFC 5322 Section 3.2.3, plus RFC 6531 section 3.3
62+ # if internationalized local parts are allowed)
6063 atext_re = re .compile ('[.' + (ATEXT if not allow_smtputf8 else ATEXT_INTL ) + ']' )
6164 bad_chars = set (
6265 safe_character_display (c )
@@ -67,9 +70,11 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
6770 raise EmailSyntaxError ("The email address contains invalid characters before the @-sign: " + ", " .join (sorted (bad_chars )) + "." )
6871
6972 # Check for dot errors imposed by the dot-atom rule.
73+ # (RFC 2822 3.2.4)
7074 check_dot_atom (local , 'An email address cannot start with a {}.' , 'An email address cannot have a {} immediately before the @-sign.' , is_hostname = False )
7175
72- # Check the local part against the regular expression for the older ASCII requirements.
76+ # Check the local part against the non-internationalized regular expression.
77+ # (RFC 2822 3.2.4)
7378 m = DOT_ATOM_TEXT .match (local )
7479 if m :
7580 # Return the local part unchanged and flag that SMTPUTF8 is not needed.
@@ -82,6 +87,7 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
8287 else :
8388 # The local part failed the ASCII check. Now try the extended internationalized requirements.
8489 # This should already be handled by the bad_chars and check_dot_atom tests above.
90+ # It's the same pattern but with additional characters permitted.
8591 m = DOT_ATOM_TEXT_INTL .match (local )
8692 if not m :
8793 raise EmailSyntaxError ("The email address contains invalid characters before the @-sign." )
@@ -97,7 +103,8 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
97103
98104 # Check for unsafe characters.
99105 # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
100- # by DOT_ATOM_TEXT_INTL.
106+ # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but
107+ # they may not be valid, safe, or sensible Unicode strings.
101108 check_unsafe_chars (local )
102109
103110 # Try encoding to UTF-8. Failure is possible with some characters like
@@ -117,39 +124,56 @@ def validate_email_local_part(local, allow_smtputf8=True, allow_empty_local=Fals
117124
118125
119126def check_unsafe_chars (s ):
127+ # Check for unsafe characters or characters that would make the string
128+ # invalid or non-sensible Unicode.
120129 bad_chars = set ()
121130 for i , c in enumerate (s ):
122131 category = unicodedata .category (c )
123132 if category [0 ] in ("L" , "N" , "P" , "S" ):
124- # letters , numbers, punctuation, and symbols are permitted
133+ # Letters , numbers, punctuation, and symbols are permitted.
125134 pass
126135 elif category [0 ] == "M" :
127- # combining character in first position would combine with something
128- # outside of the email address if concatenated to the right, but are
129- # otherwise permitted
136+ # Combining character in first position would combine with something
137+ # outside of the email address if concatenated, so they are not safe.
138+ # We also check if this occurs after the @-sign, which would not be
139+ # sensible.
130140 if i == 0 :
131141 bad_chars .add (c )
132- elif category [0 ] in ("Z" , "C" ):
133- # spaces and line/paragraph characters (Z) and
134- # control, format, surrogate, private use, and unassigned code points (C)
142+ elif category [0 ] == "Z" :
143+ # Spaces and line/paragraph characters (Z) outside of the ASCII range
144+ # are not specifically disallowed as far as I can tell, but they
145+ # violate the spirit of the non-internationalized specification that
146+ # email addresses do not contain spaces or line breaks when not quoted.
147+ bad_chars .add (c )
148+ elif category [0 ] == "C" :
149+ # Control, format, surrogate, private use, and unassigned code points (C)
150+ # are all unsafe in various ways. Control and format characters can affect
151+ # text rendering if the email address is concatenated with other text.
152+ # Bidirectional format characters are unsafe, even if used properly, because
153+ # they cause an email address to render as a different email address.
154+ # Private use characters do not make sense for publicly deliverable
155+ # email addresses.
135156 bad_chars .add (c )
136157 else :
137158 # All categories should be handled above, but in case there is something new
138- # in the future.
159+ # to the Unicode specification in the future, reject all other categories .
139160 bad_chars .add (c )
140161 if bad_chars :
141162 raise EmailSyntaxError ("The email address contains unsafe characters: "
142163 + ", " .join (safe_character_display (c ) for c in sorted (bad_chars )) + "." )
143164
144165
145166def check_dot_atom (label , start_descr , end_descr , is_hostname ):
167+ # RFC 2822 3.2.4
146168 if label .endswith ("." ):
147169 raise EmailSyntaxError (end_descr .format ("period" ))
148170 if label .startswith ("." ):
149171 raise EmailSyntaxError (start_descr .format ("period" ))
150172 if ".." in label :
151173 raise EmailSyntaxError ("An email address cannot have two periods in a row." )
174+
152175 if is_hostname :
176+ # RFC 952
153177 if label .endswith ("-" ):
154178 raise EmailSyntaxError (end_descr .format ("hyphen" ))
155179 if label .startswith ("-" ):
@@ -166,13 +190,19 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
166190 raise EmailSyntaxError ("There must be something after the @-sign." )
167191
168192 # Check for invalid characters before normalization.
193+ # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses)
169194 bad_chars = set (
170195 safe_character_display (c )
171196 for c in domain
172197 if not ATEXT_HOSTNAME_INTL .match (c )
173198 )
174199 if bad_chars :
175200 raise EmailSyntaxError ("The part after the @-sign contains invalid characters: " + ", " .join (sorted (bad_chars )) + "." )
201+
202+ # Check for unsafe characters.
203+ # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
204+ # by ATEXT_HOSTNAME_INTL. Other characters may be permitted by the email specs, but
205+ # they may not be valid, safe, or sensible Unicode strings.
176206 check_unsafe_chars (domain )
177207
178208 # Perform UTS-46 normalization, which includes casefolding, NFC normalization,
@@ -191,9 +221,13 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
191221 # Check that before we do IDNA encoding because the IDNA library gives
192222 # unfriendly errors for these cases, but after UTS-46 normalization because
193223 # it can insert periods and hyphens (from fullwidth characters).
224+ # (RFC 952, RFC 2822 3.2.4)
194225 check_dot_atom (domain , 'An email address cannot have a {} immediately after the @-sign.' , 'An email address cannot end with a {}.' , is_hostname = True )
226+
227+ # Check for RFC 5890's invalid R-LDH labels, which are labels that have
228+ # dashes in the third and fourth positions but do not start with "xn".
195229 for label in domain .split ("." ):
196- if re .match (r"(?!xn)..--" , label , re .I ): # RFC 5890 invalid R-LDH labels
230+ if re .match (r"(?!xn)..--" , label , re .I ):
197231 raise EmailSyntaxError ("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode." )
198232
199233 if DOT_ATOM_TEXT_HOSTNAME .match (domain ):
@@ -230,23 +264,29 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
230264 if not m :
231265 raise EmailSyntaxError ("The email address contains invalid characters after the @-sign after IDNA encoding." )
232266
233- # RFC 5321 4.5.3.1.2
234- # We're checking the number of bytes (octets) here, which can be much
267+ # Check the length of the domain name in bytes.
268+ # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2)
269+ # We're checking the number of bytes ("octets") here, which can be much
235270 # higher than the number of characters in internationalized domains,
236271 # on the assumption that the domain may be transmitted without SMTPUTF8
237272 # as IDNA ASCII. (This is also checked by idna.encode, so this exception
238273 # is never reached for internationalized domains.)
239274 if len (ascii_domain ) > DOMAIN_MAX_LENGTH :
240275 reason = get_length_reason (ascii_domain , limit = DOMAIN_MAX_LENGTH )
241276 raise EmailSyntaxError ("The email address is too long after the @-sign {}." .format (reason ))
277+
278+ # Also check the label length limit.
279+ # (RFC 1035 2.3.1)
242280 for label in ascii_domain .split ("." ):
243281 if len (label ) > DNS_LABEL_LENGTH_LIMIT :
244282 reason = get_length_reason (label , limit = DNS_LABEL_LENGTH_LIMIT )
245- raise EmailSyntaxError ("On either side of the @-sign, periods cannot be separated by so many characters {}." .format (reason ))
283+ raise EmailSyntaxError ("After the @-sign, periods cannot be separated by so many characters {}." .format (reason ))
246284
247285 if globally_deliverable :
248286 # All publicly deliverable addresses have domain names with at least
249- # one period, and we'll consider the lack of a period a syntax error
287+ # one period, at least for gTLDs created since 2013 (per the ICANN Board
288+ # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
289+ # We'll consider the lack of a period a syntax error
250290 # since that will match people's sense of what an email address looks
251291 # like. We'll skip this in test environments to allow '@test' email
252292 # addresses.
@@ -260,6 +300,7 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
260300 # Check special-use and reserved domain names.
261301 # Some might fail DNS-based deliverability checks, but that
262302 # can be turned off, so we should fail them all sooner.
303+ # See the references in __init__.py.
263304 from . import SPECIAL_USE_DOMAIN_NAMES
264305 for d in SPECIAL_USE_DOMAIN_NAMES :
265306 # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES.
@@ -274,15 +315,14 @@ def validate_email_domain_part(domain, test_environment=False, globally_delivera
274315 # but not be actual IDNA. For ASCII-only domains, the conversion out
275316 # of IDNA just gives the same thing back.
276317 #
277- # This gives us the canonical internationalized form of the domain,
278- # which we should use in all error messages.
318+ # This gives us the canonical internationalized form of the domain.
279319 try :
280320 domain_i18n = idna .decode (ascii_domain .encode ('ascii' ))
281321 except idna .IDNAError as e :
282322 raise EmailSyntaxError ("The part after the @-sign is not valid IDNA ({})." .format (str (e )))
283323
284324 # Check for invalid characters after normalization. These
285- # should never arise.
325+ # should never arise. See the similar checks above.
286326 bad_chars = set (
287327 safe_character_display (c )
288328 for c in domain
0 commit comments