-
Notifications
You must be signed in to change notification settings - Fork 636
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DYN-6146 Removing PII data from a JSON workspace #14471
Merged
QilongTang
merged 6 commits into
DynamoDS:master
from
jesusalvino:DYN-6146-RemovePIIData
Nov 6, 2023
Merged
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
c24745f
Removing PII data from a JSON workspace
dfbeeb3
Refactoring and Adding a Unit Test
033ffc4
Restricting the access level
8bb6593
PII Detector refactored
26e0585
Merge branch 'master' into DYN-6146-RemovePIIData
QilongTang 73d5917
Updating the Unit Test
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
|
||
using Newtonsoft.Json.Linq; | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Text.RegularExpressions; | ||
|
||
namespace Dynamo.Utilities
{
    /// <summary>
    /// Helper class for removing PII data from a JSON workspace.
    /// </summary>
    /// <remarks>
    /// Detection is purely pattern based: a fixed list of regular expressions
    /// (e-mail, website, directory, credit card, SSN, IP address, date) is applied
    /// to selected string properties of the workspace and every match is replaced
    /// with an empty string.
    /// </remarks>
    internal static class PIIDetector
    {
        // Property names looked up in the workspace JSON.
        const string Nodes = "Nodes";
        const string Id = "Id";
        const string InputValue = "InputValue";
        const string HintPath = "HintPath";
        const string Code = "Code";
        const string View = "View";
        const string Annotations = "Annotations";
        const string Title = "Title";

        // Properties scrubbed on every entry of "Nodes" and of "View"/"Annotations".
        private static readonly string[] NodePropertiesToScrub = { InputValue, HintPath, Code };
        private static readonly string[] AnnotationPropertiesToScrub = { Title };

        // The patterns are parsed once and reused; Regex instances are immutable
        // and safe to share between callers.
        private static readonly Regex EmailRegex = new Regex(@"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*");
        private static readonly Regex WebsiteRegex = new Regex(@"(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+");
        // Every alternative is anchored with ^ and no Multiline option is set, so a
        // directory is only recognised at the very start of the inspected string.
        private static readonly Regex DirectoryRegex = new Regex(@"(^([a-z]|[A-Z]):(?=\\(?![\0-\37<>:""/\\|?*])|\/(?![\0-\37<>:""/\\|?*])|$)|^\\(?=[\\\/][^\0-\37<>:""/\\|?*]+)|^(?=(\\|\/)$)|^\.(?=(\\|\/)$)|^\.\.(?=(\\|\/)$)|^(?=(\\|\/)[^\0-\37<>:""/\\|?*]+)|^\.(?=(\\|\/)[^\0-\37<>:""/\\|?*]+)|^\.\.(?=(\\|\/)[^\0-\37<>:""/\\|?*]+))((\\|\/)[^\0-\37<>:""/\\|?*]+|(\\|\/)$)*()");
        private static readonly Regex CreditCardRegex = new Regex(@"(\d{4}[-, ]\d{4})");
        private static readonly Regex SsnRegex = new Regex(@"\d{3}[- ]\d{2}[- ]\d{4}");
        private static readonly Regex IpRegex = new Regex(@"((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)");
        private static readonly Regex DateRegex = new Regex(@"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}");

        // Order in which the patterns are applied when scrubbing. Each replacement
        // runs on the output of the previous one, so the order affects the result.
        private static readonly Regex[] ScrubSequence =
        {
            EmailRegex,
            WebsiteRegex,
            DirectoryRegex,
            CreditCardRegex,
            SsnRegex,
            IpRegex,
            DateRegex
        };

        /// <summary>
        /// Removes the PII data from a JSON workspace, indicating the status of the result.
        /// </summary>
        /// <remarks>
        /// The workspace is modified in place; the returned object is the same instance
        /// that was passed in. Only string values of "InputValue", "HintPath" and "Code"
        /// on the entries of "Nodes", and of "Title" on the entries of "View"/"Annotations",
        /// are scrubbed. Non-string values (numbers, booleans, null, nested objects) are
        /// left untouched so their JSON type is preserved and a single unexpected value
        /// does not abort the whole pass.
        /// </remarks>
        /// <param name="jsonObject">The workspace to scrub.</param>
        /// <returns>
        /// The (mutated) workspace together with a flag that is false when an exception
        /// interrupted the scrub, e.g. because the workspace is null or "Nodes", "View"
        /// or "Annotations" is present but has an unexpected shape. In that case the
        /// workspace may be only partially scrubbed.
        /// </returns>
        public static Tuple<JObject, bool> RemovePIIData(JObject jsonObject)
        {
            bool removeResult = true;

            try
            {
                ScrubStringProperties(AsArrayOrNull(jsonObject[Nodes]), NodePropertiesToScrub);

                JToken view = jsonObject[View];
                if (IsPresent(view))
                {
                    // The cast throws (and the failure is reported) if "View" is not an object.
                    JToken annotations = ((JObject)view)[Annotations];
                    ScrubStringProperties(AsArrayOrNull(annotations), AnnotationPropertiesToScrub);
                }
            }
            catch (Exception)
            {
                // Best effort: the caller is told about the failure through the status flag.
                removeResult = false;
            }

            return new Tuple<JObject, bool>(jsonObject, removeResult);
        }

        /// <summary>
        /// Returns the first entry of "Nodes" whose "Id" equals <paramref name="nodeId"/>.
        /// </summary>
        /// <param name="jsonWorkspace">The workspace to search.</param>
        /// <param name="nodeId">The id to look for.</param>
        /// <returns>The matching node, or null when the workspace, "Nodes" or the node is missing.</returns>
        public static JToken GetNodeById(JObject jsonWorkspace, string nodeId)
        {
            return FindById(jsonWorkspace?[Nodes], nodeId);
        }

        /// <summary>
        /// Returns the value of a property of the node identified by <paramref name="nodeId"/>.
        /// </summary>
        /// <param name="jsonWorkspace">The workspace to search.</param>
        /// <param name="nodeId">The id of the node.</param>
        /// <param name="propertyName">The name of the property to read.</param>
        /// <returns>The property value, or null when the node or the property is missing.</returns>
        public static JToken GetNodeValue(JObject jsonWorkspace, string nodeId, string propertyName)
        {
            return GetPropertyValue(GetNodeById(jsonWorkspace, nodeId), propertyName);
        }

        /// <summary>
        /// Returns the "Title" of the entry of "View"/"Annotations" identified by <paramref name="nodeId"/>.
        /// </summary>
        /// <param name="jsonWorkspace">The workspace to search.</param>
        /// <param name="nodeId">The id of the annotation.</param>
        /// <returns>The title value, or null when the annotation or its title is missing.</returns>
        public static JToken GetNoteValue(JObject jsonWorkspace, string nodeId)
        {
            JToken annotations = jsonWorkspace?[View]?[Annotations];
            return GetPropertyValue(FindById(annotations, nodeId), Title);
        }

        /// <summary>Checks whether the value contains a match of the e-mail pattern.</summary>
        internal static bool ContainsEmail(string value) { return IsMatch(EmailRegex, value); }

        /// <summary>Checks whether the value contains a match of the website pattern.</summary>
        internal static bool ContainsWebsite(string value) { return IsMatch(WebsiteRegex, value); }

        /// <summary>Checks whether the value starts with a match of the directory pattern.</summary>
        internal static bool ContainsDirectory(string value) { return IsMatch(DirectoryRegex, value); }

        /// <summary>Checks whether the value contains a match of the credit card pattern.</summary>
        internal static bool ContainsCreditCard(string value) { return IsMatch(CreditCardRegex, value); }

        /// <summary>Checks whether the value contains a match of the SSN pattern.</summary>
        internal static bool ContainsSSN(string value) { return IsMatch(SsnRegex, value); }

        /// <summary>Checks whether the value contains a match of the IP address pattern.</summary>
        internal static bool ContainsIpAddress(string value) { return IsMatch(IpRegex, value); }

        /// <summary>Checks whether the value contains a match of the date pattern.</summary>
        internal static bool ContainsDate(string value) { return IsMatch(DateRegex, value); }

        /// <summary>
        /// Removes the PII data based on the information patterns.
        /// </summary>
        /// <param name="data">The text to scrub.</param>
        /// <returns>
        /// The text with every pattern match replaced by an empty string; a null or
        /// empty input is returned unchanged.
        /// </returns>
        internal static string RemovePIIData(string data)
        {
            if (string.IsNullOrEmpty(data))
            {
                return data;
            }

            string result = data;
            foreach (Regex pattern in ScrubSequence)
            {
                result = pattern.Replace(result, string.Empty);
            }

            return result;
        }

        /// <summary>Returns true when the token exists and is not a JSON null.</summary>
        private static bool IsPresent(JToken token)
        {
            return token != null && token.Type != JTokenType.Null;
        }

        /// <summary>
        /// Returns the token as an array, or null when it is absent or a JSON null.
        /// Any other non-array token makes the cast throw.
        /// </summary>
        private static JArray AsArrayOrNull(JToken token)
        {
            return IsPresent(token) ? (JArray)token : null;
        }

        /// <summary>
        /// Scrubs the named string properties on every object entry of <paramref name="items"/>.
        /// </summary>
        private static void ScrubStringProperties(JArray items, string[] propertyNames)
        {
            if (items == null)
            {
                return;
            }

            foreach (JObject item in items.OfType<JObject>())
            {
                // Snapshot the properties before assigning new values to them.
                foreach (JProperty property in item.Properties().ToList())
                {
                    // Restricting the scrub to string tokens keeps numbers and booleans
                    // from being converted to (and possibly blanked as) strings.
                    if (propertyNames.Contains(property.Name) && property.Value.Type == JTokenType.String)
                    {
                        property.Value = RemovePIIData((string)property.Value);
                    }
                }
            }
        }

        /// <summary>Returns the first object child of <paramref name="items"/> whose "Id" equals <paramref name="id"/>.</summary>
        private static JObject FindById(JToken items, string id)
        {
            if (items == null)
            {
                return null;
            }

            return items.Children<JObject>().FirstOrDefault(item => item.Value<string>(Id) == id);
        }

        /// <summary>Returns the value of the named property of <paramref name="owner"/>, or null when either is missing.</summary>
        private static JToken GetPropertyValue(JToken owner, string propertyName)
        {
            return owner?.Children<JProperty>().FirstOrDefault(property => property.Name == propertyName)?.Value;
        }

        /// <summary>Returns true when <paramref name="value"/> is not null and contains a match of <paramref name="pattern"/>.</summary>
        private static bool IsMatch(Regex pattern, string value)
        {
            return value != null && pattern.IsMatch(value);
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this best practice? where are these regexes pulled from? I don't know how others feel, but this seems very hard to understand, reason about, or maintain - especially because there are no tests in this PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
of note
https://docs.aws.amazon.com/comprehend/latest/dg/how-pii.html
https://microsoft.github.io/presidio/analyzer/
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have added a unit test for it so we can discuss the regexes used and their scope/granularity in detail. I have already used them in the past on other projects (as we know, this feature is cross-project/cross-company).
As for your references, I already considered similar options in the spike for this task: https://jira.autodesk.com/browse/DYN-5964. I considered the PII helper class the main option, but it is open to being extended or changed as the team decides.