Skip to content

Commit

Permalink
Fix Chinese characters being recognized as email - first commit (#3185)
Browse files Browse the repository at this point in the history
Co-authored-by: Michael Wang (Centific Technologies Inc) <[email protected]>
  • Loading branch information
MichaelMWW and Michael Wang (Centific Technologies Inc) authored Nov 26, 2024
1 parent 66901d1 commit 25afa66
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .NET/Microsoft.Recognizers.Definitions.Common/BaseEmail.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public static class BaseEmail
public const string EmailRegex = @"(([-a-z0-9_\+\.]+)@([-a-z\d\.]+)\.([a-z\.]{2,6}))";
public const string IPv4Regex = @"(?<ipv4>(\d{1,3}\.){3}\d{1,3})";
public const string NormalSuffixRegex = @"(([0-9a-z][-]*[0-9a-z]*\.)+(?<tld>[a-z][\-a-z]{0,22}[a-z]))";
public const string EmailPrefix = @"(?("")("".+?(?<!\\)"")|(([0-9a-z]((\.(?!\.))|[-!#\$%&'\*\+/=\?\^\{\}\|~\w])*)(?<=[0-9a-z])))";
public const string EmailPrefix = @"(?("")("".+?(?<!\\)"")|(([0-9a-z]((\.(?!\.))|([-!#\$%&'\*\+/=\?\^\{\}\|~]|[a-zA-Z0-9_]))*)(?<=[0-9a-z])))";
public static readonly string EmailSuffix = $@"(?(\[)(\[{IPv4Regex}\])|{NormalSuffixRegex})";
public static readonly string EmailRegex2 = $@"(({EmailPrefix})@({EmailSuffix}))";
public const string RFC5322Regex = @"\A(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|""(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*"")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])\z";
Expand Down
2 changes: 1 addition & 1 deletion Patterns/Base-Email.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ IPv4Regex: !simpleRegex
NormalSuffixRegex: !simpleRegex
def: (([0-9a-z][-]*[0-9a-z]*\.)+(?<tld>[a-z][\-a-z]{0,22}[a-z]))
EmailPrefix: !simpleRegex
def: (?("")("".+?(?<!\\)"")|(([0-9a-z]((\.(?!\.))|[-!#\$%&'\*\+/=\?\^\{\}\|~\w])*)(?<=[0-9a-z])))
def: (?("")("".+?(?<!\\)"")|(([0-9a-z]((\.(?!\.))|([-!#\$%&'\*\+/=\?\^\{\}\|~]|[a-zA-Z0-9_]))*)(?<=[0-9a-z])))
EmailSuffix: !nestedRegex
def: (?(\[)(\[{IPv4Regex}\])|{NormalSuffixRegex})
references: [ IPv4Regex, NormalSuffixRegex ]
Expand Down
16 changes: 14 additions & 2 deletions Specs/Sequence/English/EmailModel.json
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,18 @@
}
]
},
{
"Input": "a邮箱地址[email protected]",
"Results": [
{
"Text": "[email protected]",
"TypeName": "email",
"Resolution": {
"value": "[email protected]"
}
}
]
},
{
"Input": "Hello, @Carol, please write to me at [email protected] for more information on task #A1",
"Results": [
Expand Down Expand Up @@ -142,13 +154,13 @@
},
{
"Input": "Both [email protected] and [email protected] are not valid e-mail addresses.",
"Comment": "By default the current system is strict. If a relaxed match is needed (to catch these), enable the Relaxed option.",
"Comment": "By default the current system is strict. If a relaxed match is needed (to catch these), enable the Relaxed option.",
"NotSupportedByDesign": "javascript, python",
"Results": []
},
{
"Input": "Periods at the end of addresses can be ambiguous. Contact [email protected].",
"Comment": "By default the current system is strict. If a relaxed match is needed (to catch the period), enable the Relaxed option.",
"Comment": "By default the current system is strict. If a relaxed match is needed (to catch the period), enable the Relaxed option.",
"NotSupportedByDesign": "javascript, python, java",
"Results": [
{
Expand Down

0 comments on commit 25afa66

Please sign in to comment.