Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DYN-6146 Removing PII data from a JSON workspace #14471

Merged
merged 6 commits into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions src/DynamoCoreWpf/ViewModels/Core/WorkspaceViewModel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,16 @@ internal void ZoomOutInternal()
ResetFitViewToggle(null);
}

/// <summary>
/// Produces the JSON object for this workspace: the model is serialized and parsed,
/// and the result is handed to AddViewBlockToJSON so the View block is added.
/// </summary>
/// <param name="engine">Optional engine controller forwarded to the model serializer.</param>
/// <returns>The object returned by AddViewBlockToJSON for the parsed workspace JSON.</returns>
internal JObject GetJsonRepresentation(EngineController engine = null)
{
    // Serialize the workspace model and parse it into a JObject.
    JObject workspaceJson = JObject.Parse(Model.ToJson(engine));

    // Attach the View block and hand back the combined object.
    return AddViewBlockToJSON(workspaceJson);
}

/// <summary>
/// WorkspaceViewModel's Save method does a two-part serialization. First, it serializes the Workspace,
/// then adds a View property to serialized Workspace, and sets its value to the serialized ViewModel.
Expand All @@ -616,14 +626,11 @@ internal void Save(string filePath, bool isBackup = false, EngineController engi

//set the name before serializing model.
this.Model.setNameBasedOnFileName(filePath, isBackup);
// Stage 1: Serialize the workspace.
var json = Model.ToJson(engine);
var json_parsed = JObject.Parse(json);

// Stage 2: Add the View.
var jo = AddViewBlockToJSON(json_parsed);
// Stage 1: Serialize the workspace and the View
var jo = GetJsonRepresentation(engine);

// Stage 3: Save
// Stage 2: Save
string saveContent;
if(saveContext == SaveContext.SaveAs && !isBackup)
{
Expand Down Expand Up @@ -653,7 +660,7 @@ internal void Save(string filePath, bool isBackup = false, EngineController engi
saveContent = jo.ToString();
}
File.WriteAllText(filePath, saveContent);

// Handle Workspace or CustomNodeWorkspace related non-serialization internal logic
// Only for actual save, update file path and recent file list
if (!isBackup)
Expand Down
133 changes: 133 additions & 0 deletions src/DynamoUtilities/PIIDetector.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@

using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;

namespace Dynamo.Utilities
{
/// <summary>
/// Helper Class for removing PII Data from a JSON workspace
/// </summary>
internal static class PIIDetector
{
// JSON property names used by RemovePIIData(JObject).
// "Nodes" is the top-level array whose entries are scrubbed;
// InputValue, HintPath and Code are the per-node properties that get scrubbed.
const string Nodes = "Nodes";
const string InputValue = "InputValue";
const string HintPath = "HintPath";
const string Code = "Code";
// "View" is the top-level object holding the "Annotations" array;
// Title is the per-annotation property that gets scrubbed.
const string View = "View";
const string Annotations = "Annotations";
const string Title = "Title";

/// <summary>
/// Removes PII data from a JSON workspace, in place, and reports whether the removal completed.
/// The InputValue, HintPath and Code properties of every entry in "Nodes" and the Title
/// property of every entry in View.Annotations are scrubbed. Only textual values are
/// rewritten; numbers, booleans, nulls and nested containers are left untouched so the
/// workspace keeps its original JSON types and a single non-text value cannot abort the pass.
/// </summary>
/// <param name="jsonObject">The workspace JSON. The same instance is modified and returned.</param>
/// <returns>
/// A tuple holding the (modified) input object and a flag that is false when the input was
/// null or an exception interrupted the removal (for example "Nodes" is not an array).
/// </returns>
public static Tuple<JObject,bool> RemovePIIData(JObject jsonObject)
{
    bool removeResult = true;

    try
    {
        // A null workspace cannot be scrubbed; report failure without throwing.
        if (jsonObject == null)
        {
            return new Tuple<JObject, bool>(jsonObject, false);
        }

        // Missing sections mean there is nothing to scrub; wrongly-typed sections
        // still fail the casts below and are reported through the flag.
        var nodes = (JArray)jsonObject[Nodes];
        if (nodes != null)
        {
            foreach (JObject node in nodes)
            {
                ScrubTextProperties(node, InputValue, HintPath, Code);
            }
        }

        var view = (JObject)jsonObject[View];
        var annotations = view == null ? null : (JArray)view[Annotations];
        if (annotations != null)
        {
            foreach (JObject annotation in annotations)
            {
                ScrubTextProperties(annotation, Title);
            }
        }
    }
    catch
    {
        // Deliberate best-effort: callers inspect the flag instead of handling exceptions.
        removeResult = false;
    }

    return new Tuple<JObject, bool>(jsonObject, removeResult);
}

/// <summary>
/// Scrubs the named properties of a JSON object when their value is textual.
/// </summary>
/// <param name="item">The node or annotation object to scrub in place.</param>
/// <param name="propertyNames">Names of the properties whose values should be scrubbed.</param>
private static void ScrubTextProperties(JObject item, params string[] propertyNames)
{
    // Snapshot the properties so values can be replaced while iterating.
    foreach (var property in item.Properties().ToList())
    {
        if (!propertyNames.Contains(property.Name))
        {
            continue;
        }

        // String tokens are text; Date tokens are included because Json.NET's default
        // parsing can turn ISO-8601 date strings into Date tokens, and those were
        // scrubbed before as well. Everything else is not free text and is skipped.
        var valueType = property.Value.Type;
        if (valueType == JTokenType.String || valueType == JTokenType.Date)
        {
            property.Value = RemovePIIData((string)property.Value);
        }
    }
}

// Email address: word characters (optionally joined by '-', '+' or '.'), an '@',
// then a domain of word characters (joined by '-' or '.') containing at least one '.'.
static string emailPattern = @"\w+([-+.]\w+)*@\w+([-.]\w+)*\.\w+([-.]\w+)*";
Copy link
Member

@mjkkirschner mjkkirschner Oct 17, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this best practice? where are these regexes pulled from? I don't know how others feel, but this seems very hard to understand, reason about, or maintain - especially because there are no tests in this PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

of note https://docs.aws.amazon.com/comprehend/latest/dg/how-pii.html https://microsoft.github.io/presidio/analyzer/

I have added a unit test for it so we can discuss in detail the regexes used and their scope/granularity. I have already used them in the past for other projects (as we know, this feature is cross-project/company).

Regarding your references, I already considered similar options in the spike for this task: https://jira.autodesk.com/browse/DYN-5964. I considered the PII helper class the main option, but it is open to being extended or changed as the team decides.

// Web address: an optional "http://" or "https://" prefix, a run of word characters,
// dots or hyphens, at least one ".segment", then one or more URL characters.
// NOTE(review): the scheme is optional, so dotted tokens without a scheme can match too.
static string websitePattern = @"(?:http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+";
// File-system path. Every alternative starts with '^' and the visible callers pass no
// RegexOptions, so this can only match at the very start of the input string.
// The alternatives appear to cover drive-letter paths ("C:\..."), paths starting with
// two separators, "." / ".." relative paths and root-relative paths, followed by segments
// separated by '\' or '/' that exclude <>:"/\|?* and the \0-\37 range
// (presumably control characters written as octal escapes - TODO confirm).
static string directoryPattern = @"(^([a-z]|[A-Z]):(?=\\(?![\0-\37<>:""/\\|?*])|\/(?![\0-\37<>:""/\\|?*])|$)|^\\(?=[\\\/][^\0-\37<>:""/\\|?*]+)|^(?=(\\|\/)$)|^\.(?=(\\|\/)$)|^\.\.(?=(\\|\/)$)|^(?=(\\|\/)[^\0-\37<>:""/\\|?*]+)|^\.(?=(\\|\/)[^\0-\37<>:""/\\|?*]+)|^\.\.(?=(\\|\/)[^\0-\37<>:""/\\|?*]+))((\\|\/)[^\0-\37<>:""/\\|?*]+|(\\|\/)$)*()";
// Credit-card fragment: two groups of four digits separated by '-', ',' or a space.
// NOTE(review): this matches any such 8-digit pair, not a complete card number.
static string creditCardPattern = @"(\d{4}[-, ]\d{4})";
// SSN-shaped value: 3 digits, 2 digits and 4 digits, each separated by '-' or a space.
static string ssnPattern = @"\d{3}[- ]\d{2}[- ]\d{4}";
// IPv4-shaped value: four dot-separated numbers, each in the range 0-255.
static string ipPattern = @"((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)";
// Numeric date: 1-2 digits, '/' or '-', 1-2 digits, '/' or '-', then 2-4 digits.
static string datePattern = @"\d{1,2}[/-]\d{1,2}[/-]\d{2,4}";

/// <summary>
/// Looks up an entry of the workspace's "Nodes" collection by its "Id" value.
/// </summary>
/// <param name="jsonWorkspace">The workspace JSON to search.</param>
/// <param name="nodeId">The "Id" value to look for.</param>
/// <returns>The first entry whose "Id" equals <paramref name="nodeId"/>, or null when none matches.</returns>
public static JToken GetNodeById(JObject jsonWorkspace,string nodeId)
{
    JToken nodeCollection = jsonWorkspace["Nodes"];
    return nodeCollection.FirstOrDefault(candidate => candidate.Value<string>("Id") == nodeId);
}

/// <summary>
/// Reads the value of a named property from the node with the given "Id".
/// </summary>
/// <param name="jsonWorkspace">The workspace JSON to search.</param>
/// <param name="nodeId">The "Id" value of the node.</param>
/// <param name="propertyName">The name of the node property to read.</param>
/// <returns>
/// The property's value, or null when the node does not exist or has no such property.
/// </returns>
public static JToken GetNodeValue(JObject jsonWorkspace, string nodeId,string propertyName)
{
    // Reuse the shared lookup instead of repeating the query.
    var node = GetNodeById(jsonWorkspace, nodeId);

    // FirstOrDefault yields null for a missing node/property; propagate null instead of throwing.
    var property = node?.Children<JProperty>().FirstOrDefault(x => x.Name == propertyName);
    return property?.Value;
}

/// <summary>
/// Reads the "Title" of the View.Annotations entry with the given "Id".
/// </summary>
/// <param name="jsonWorkspace">The workspace JSON to search.</param>
/// <param name="nodeId">The "Id" value of the annotation (note).</param>
/// <returns>
/// The annotation's "Title" value, or null when the annotation does not exist or has no "Title".
/// </returns>
public static JToken GetNoteValue(JObject jsonWorkspace, string nodeId)
{
    var annotations = jsonWorkspace["View"]["Annotations"];
    var note = annotations.FirstOrDefault(t => t.Value<string>("Id") == nodeId);

    // FirstOrDefault yields null for a missing note/property; propagate null instead of throwing.
    var property = note?.Children<JProperty>().FirstOrDefault(p => p.Name == "Title");
    return property?.Value;
}

// Pattern probes: each returns true when the corresponding pattern matches anywhere in value.
// Regex.IsMatch is used instead of constructing a new Regex per call; the result is the same.
internal static bool ContainsEmail(string value) { return Regex.IsMatch(value, emailPattern); }
internal static bool ContainsWebsite(string value) { return Regex.IsMatch(value, websitePattern); }
internal static bool ContainsDirectory(string value) { return Regex.IsMatch(value, directoryPattern); }
internal static bool ContainsCreditCard(string value) { return Regex.IsMatch(value, creditCardPattern); }
internal static bool ContainsSSN(string value) { return Regex.IsMatch(value, ssnPattern); }
internal static bool ContainsIpAddress(string value) { return Regex.IsMatch(value, ipPattern); }
internal static bool ContainsDate(string value) { return Regex.IsMatch(value, datePattern); }

/// <summary>
/// Removes the PII data based on the information patterns. Every match of the email,
/// website, directory, credit card, SSN, IP address and date patterns is replaced with
/// an empty string, in that order.
/// </summary>
/// <param name="data">The text to scrub. May be null or empty.</param>
/// <returns>The scrubbed text; null or empty input is returned unchanged.</returns>
internal static string RemovePIIData(string data)
{
    // Regex.Replace throws on null input; there is nothing to scrub in that case.
    if (string.IsNullOrEmpty(data))
    {
        return data;
    }

    // Order matters: earlier patterns consume text that later ones could partially match.
    var orderedPatterns = new[]
    {
        emailPattern,
        websitePattern,
        directoryPattern,
        creditCardPattern,
        ssnPattern,
        ipPattern,
        datePattern
    };

    string result = data;
    foreach (var pattern in orderedPatterns)
    {
        result = Regex.Replace(result, pattern, "");
    }

    return result;
}
}
}
57 changes: 57 additions & 0 deletions test/DynamoCoreWpfTests/WorkspaceSaving.cs
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,63 @@ public void CanSaveAsNewWorkspaceWithNewGuids()
Assert.AreEqual(legacyLinterId, newLinterId);
}

[Test]
[Category("UnitTests")]
public void RemovePIIDataFromWorkspace()
{
    // Open the sample graph whose notes and nodes carry PII-like values.
    var samplePath = Path.Combine(TestDirectory, (@"UI\GraphWithPIIData.dyn"));
    ViewModel.OpenCommand.Execute(samplePath);

    // Ids of the notes / nodes that hold each kind of PII.
    const string emailNoteId = "75ccaa00c10c4aedab9250a6d9720951";
    const string webPageNodeId = "cd09502288c448348bd2d0bcd0a3c088";
    const string directoryNodeId = "5e1f42a0cc8d427cbd7fde969a988d5f";
    const string creditCardNoteId = "2126a32c0f474a5887205bd1b3061d8a";
    const string ssnNoteId = "5bcdbd22f679417cb7e3bd19b2d984d3";
    const string ipAddressNodeId = "8d58c36ff11d4eb89025f73b4527d55a";
    const string dateNodeId = "7d471f2e3b7a4cc8946aa4101fbf348a";

    // Values as serialized, before any scrubbing.
    JObject original = ViewModel.CurrentSpaceViewModel.GetJsonRepresentation();

    JToken emailBefore = PIIDetector.GetNoteValue(original, emailNoteId);
    JToken webPageBefore = PIIDetector.GetNodeValue(original, webPageNodeId, "Code");
    JToken directoryInputBefore = PIIDetector.GetNodeValue(original, directoryNodeId, "InputValue");
    JToken directoryHintBefore = PIIDetector.GetNodeValue(original, directoryNodeId, "HintPath");
    JToken creditCardBefore = PIIDetector.GetNoteValue(original, creditCardNoteId);
    JToken ssnBefore = PIIDetector.GetNoteValue(original, ssnNoteId);
    JToken ipAddressBefore = PIIDetector.GetNodeValue(original, ipAddressNodeId, "InputValue");
    JToken dateBefore = PIIDetector.GetNodeValue(original, dateNodeId, "InputValue");

    // Scrub a fresh serialization and make sure the removal reports success.
    Tuple<JObject, bool> removal = PIIDetector.RemovePIIData(ViewModel.CurrentSpaceViewModel.GetJsonRepresentation());
    Assert.IsTrue(removal.Item2);

    // The same values, read back from the scrubbed workspace.
    JToken emailAfter = PIIDetector.GetNoteValue(removal.Item1, emailNoteId);
    JToken webPageAfter = PIIDetector.GetNodeValue(removal.Item1, webPageNodeId, "Code");
    JToken directoryInputAfter = PIIDetector.GetNodeValue(removal.Item1, directoryNodeId, "InputValue");
    JToken directoryHintAfter = PIIDetector.GetNodeValue(removal.Item1, directoryNodeId, "HintPath");
    JToken creditCardAfter = PIIDetector.GetNoteValue(removal.Item1, creditCardNoteId);
    JToken ssnAfter = PIIDetector.GetNoteValue(removal.Item1, ssnNoteId);
    JToken ipAddressAfter = PIIDetector.GetNodeValue(removal.Item1, ipAddressNodeId, "InputValue");
    JToken dateAfter = PIIDetector.GetNodeValue(removal.Item1, dateNodeId, "InputValue");

    // The unscrubbed workspace must contain every kind of PII...
    Assert.IsTrue(PIIDetector.ContainsEmail((string)emailBefore));
    Assert.IsTrue(PIIDetector.ContainsWebsite((string)webPageBefore));
    Assert.IsTrue(PIIDetector.ContainsDirectory((string)directoryInputBefore));
    Assert.IsTrue(PIIDetector.ContainsDirectory((string)directoryHintBefore));
    Assert.IsTrue(PIIDetector.ContainsCreditCard((string)creditCardBefore));
    Assert.IsTrue(PIIDetector.ContainsSSN((string)ssnBefore));
    Assert.IsTrue(PIIDetector.ContainsIpAddress((string)ipAddressBefore));
    Assert.IsTrue(PIIDetector.ContainsDate((string)dateBefore));

    // ...and the scrubbed workspace must contain none of it.
    Assert.IsFalse(PIIDetector.ContainsEmail((string)emailAfter));
    Assert.IsFalse(PIIDetector.ContainsWebsite((string)webPageAfter));
    Assert.IsFalse(PIIDetector.ContainsDirectory((string)directoryInputAfter));
    Assert.IsFalse(PIIDetector.ContainsDirectory((string)directoryHintAfter));
    Assert.IsFalse(PIIDetector.ContainsCreditCard((string)creditCardAfter));
    Assert.IsFalse(PIIDetector.ContainsSSN((string)ssnAfter));
    Assert.IsFalse(PIIDetector.ContainsIpAddress((string)ipAddressAfter));
    Assert.IsFalse(PIIDetector.ContainsDate((string)dateAfter));
}

[Test]
[Category("UnitTests")]
public void BackUpSaveDoesNotChangeName()
Expand Down
Loading
Loading