IBM · pankajskku · Sep 30, 2024 · Sep 30, 2024 · Sep 30, 2024
diff --git a/transforms/code/syntactic_concept_extractor/Makefile b/transforms/code/syntactic_concept_extractor/Makefile
@@ -0,0 +1,50 @@
+REPOROOT=../../..
+# Use `make help` to see the available rules
+include $(REPOROOT)/.make.defaults
+
+setup::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+clean::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+build::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+venv::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+publish::
+	@# Help: Recursively make $@ in all subdirs 
+	@$(MAKE) .recurse RULE=$@
+
+test::
+	@# Help: Recursively make $@ in all subdirs 
+	@$(MAKE) .recurse RULE=$@
+
+test-src::
+	@# Help: Recursively make $@ in all subdirs 
+	$(MAKE) .recurse RULE=$@
+
+set-versions:
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+.PHONY: workflow-venv
+workflow-venv:  # delegate to the rule of the same name in kfp_ray
+	$(MAKE) -C kfp_ray $@
+
+.PHONY: workflow-test
+workflow-test:  # delegate to the rule of the same name in kfp_ray
+	$(MAKE) -C kfp_ray $@
+
+.PHONY: workflow-upload
+workflow-upload:  # delegate to the rule of the same name in kfp_ray
+	$(MAKE) -C kfp_ray $@
+
+.PHONY: workflow-build
+workflow-build:  # delegate to the rule of the same name in kfp_ray
+	$(MAKE) -C kfp_ray $@
diff --git a/transforms/code/syntactic_concept_extractor/README.md b/transforms/code/syntactic_concept_extractor/README.md
@@ -0,0 +1,56 @@
+# Syntactic Concept Extractor
+
+This module extracts the base syntactic concepts from multi-language source code and represents these concepts in a unified, language-agnostic representation that can be further used for multi-language data profiling. While programming languages expose similar syntactic building blocks to represent programming intent, such as importing packages/libraries, functions, classes, loops, conditionals, comments and others, these concepts are expressed through language-specific grammar, defined by distinct keywords and syntactic form. Our framework abstracts language-specific concepts by transforming them into a unified, language-agnostic representation called universal base syntactic representation (UBSR), referred to as a concept, which is consistently encoded within the proposed schema structure. The current version supports the base syntactic concepts for importing/including packages/libraries, comments, and functions.
+
+Table 1 outlines the fields of the UBSR, which maps AST nodes to a structured schema. This schema captures syntactic nodes (based on AST node types) and the relationships between those nodes (derived from AST edges). The UBSR framework currently supports 21 languages, grouped according to their syntactic paradigms.
+
+**Table 1: UBSR Schema Representation**
+
+
+| **Key**               | **Possible Values**                                | **Description**                                                |
+|-----------------------|----------------------------------------------------|----------------------------------------------------------------|
+| **"nodes":**          |                                                    |                                                                |
+| `"id"`                | Integer (e.g., `0`, `1`)                           | Unique identifier of the node.                                 |
+| `"code_snippet"`      | String (e.g., `"ubsr_package math"`)               | A snippet of code or a description of the node.                |
+| `"node_type"`         | String (e.g., `"ubsr_root"`, `"ubsr_package"`, etc.)| Type of node representing various syntactic concepts.          |
+| `"parents"`           | Array of Integers (e.g., `[1, 2]`)                 | List of parent node IDs.                                       |
+| `"children"`          | Array of Integers (e.g., `[1, 2]`)                 | List of child node IDs.                                        |
+| **"metadata" (within nodes):** |                                            |                                                                |
+| `"info"`              | String                                             | General information about the node.                            |
+| `"language"`          | String (`"cpp"`, `"python"`, etc.)                 | Programming language of the node.                              |
+| `"original_code"`     | String (e.g., `"int main() {...}"`)                | Original code snippet corresponding to the node.               |
+| `"loc_original_code"` | Integer                                            | Line of code of the concept.                                   |
+| **"edges":**          |                                                    |                                                                |
+| `"directed_relation"` | String (`"parent_node"`)                           | Type of relationship between nodes e.g. parent-child.          |
+| `"metadata"`          | Object                                             | Additional metadata for the edge, which can be empty.          |
+
+
+As shown in Table 2, the framework standardizes code representation by grouping the 21 supported languages into these syntactic paradigms. Concepts that are absent in a language are marked as NA in the table. The base syntactic concepts extracted from the UBSR of the code can be used to derive syntactic and semantic insights into the code data.
+
+**Table 2: Base Syntactic Concepts Supported by the UBSR across Different Syntactical Paradigms**
+
+| **Syntactical Paradigms**                          | **Languages Supported (Known\*)**                                                                 | **Package** | **Function** | **Comment** |
+|----------------------------------------------------|---------------------------------------------------------------------------------------------------|-------------|--------------|-------------|
+| **C-like Syntax**                                  | **C\***, **Java\***, **C#**, **CPP**, **Objective C**, **Rust**, **Golang**, Kotlin                 | Yes         | Yes          | Yes         |
+| **Scripting and Dynamic Syntax**                   | **Python\***, **JavaScript\***, **Dart**, **Typescript**                                           | Yes         | Yes          | Yes         |
+|                                                    | QML                                                                                               | Yes         | NA           | Yes         |
+|                                                    | **Perl**                                                                                          | Yes         | Yes          | NA          |
+| **Functional and Expression-Oriented Syntax**      | **Haskell\***, Elm\*, Agda, **D**, **Nim**, **Scala**                                              | Yes         | Yes          | Yes         |
+|                                                    | **Ocaml**                                                                                         | Yes         | NA           | Yes         |
+
+
+* [python](python/README.md) - provides the base python-based syntactic concept extractor
+implementation.
+* [ray](ray/README.md) - provides the base ray-based syntactic concept extractor
+implementation.
+
+
+
+**Offline Path for Syntactic Rule Generation**
+
+The offline path is critical for expanding and refining the syntactic rule database, enabling the UBSR framework to adapt to new languages and syntactic constructs. This process leverages LLMs to generate syntactic rules for languages that are not yet included in the rule database. To achieve this, we utilize a Few-shot Chain of Thought prompting technique, guiding the LLM through a step-by-step rule generation process. By providing carefully curated training exemplars and detailed instructions, this method ensures the LLM can accurately generalize from these examples to produce effective syntactic rules for a wide range of languages. This structured approach enhances the flexibility of the UBSR framework, allowing it to seamlessly handle evolving language constructs.
+
+The implementation of the UI-based offline customization tool is available [here](python/src/offline-customizations). To run the tool, use the following command.
+
+`streamlit run LLM_runner_app.py`
+
diff --git a/transforms/code/syntactic_concept_extractor/input/data_profiler_params.json b/transforms/code/syntactic_concept_extractor/input/data_profiler_params.json
@@ -0,0 +1,5 @@
+{
+    "input": "multi-package.parquet",
+    "contents": "Contents",
+    "language": "Language"
+}
diff --git a/transforms/code/syntactic_concept_extractor/input/multi-package.parquet b/transforms/code/syntactic_concept_extractor/input/multi-package.parquet
diff --git a/transforms/code/syntactic_concept_extractor/python/.dockerignore b/transforms/code/syntactic_concept_extractor/python/.dockerignore
@@ -0,0 +1 @@
+venv/
diff --git a/transforms/code/syntactic_concept_extractor/python/.gitignore b/transforms/code/syntactic_concept_extractor/python/.gitignore
@@ -0,0 +1,37 @@
+test-data/output
+output/*
+/output/
+data-processing-lib/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+
+# Distribution / packaging
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+.tox/
+htmlcov
+.coverage
+.cache
+nosetests.xml
+coverage.xml
diff --git a/transforms/code/syntactic_concept_extractor/python/Makefile b/transforms/code/syntactic_concept_extractor/python/Makefile
@@ -0,0 +1,52 @@
+# Define the root of the local git clone so that the common rules
+# know where they are running from.
+REPOROOT=../../../..
+# Include a library of common .transform.* targets which most
+# transforms should be able to reuse.  However, feel free
+# to override/redefine the rules below. 
+
+# $(REPOROOT)/.make.versions file contains the versions
+
+TRANSFORM_NAME=syntactic_concept_extractor
+
+include $(REPOROOT)/transforms/.make.transforms
+
+# Map the standard targets onto the shared .transforms.* rules.
+venv:: .transforms.python-venv
+
+test:: .transforms.python-test
+
+clean:: .transforms.clean
+
+image:: .transforms.python-image
+
+test-src:: .transforms.test-src
+
+setup:: .transforms.setup
+
+# build makes both the build-dist and image targets.
+build:: build-dist image
+
+publish: publish-image
+
+publish-image:: .transforms.publish-image-python
+
+# The distribution (TOML) version is kept identical to the image version.
+set-versions:
+	$(MAKE) .transforms.set-versions TRANSFORM_PYTHON_VERSION=$(SYNTACTIC_CONCEPT_EXTRACTOR_PYTHON_VERSION) TOML_VERSION=$(SYNTACTIC_CONCEPT_EXTRACTOR_PYTHON_VERSION)
+
+build-dist:: .defaults.build-dist
+
+publish-dist:: .defaults.publish-dist
+
+# Ensure RUN_ARGS has a default value. NOTE(review): make does not strip quotes, so the default is the literal two-character string "" rather than an empty value - confirm this is what the consuming rule expects.
+RUN_ARGS ?= ""
+
+# run-cli-sample: .transforms.run-cli-python-sample
+
+run-local-sample: .transforms.run-local-sample
+
+run-local-python-sample:
+	$(MAKE) .transforms.run-local-python-sample \
+	RUN_FILE=syntactic_concept_extractor_local_python.py \
+	RUN_ARGS="--content 'Contents' --language 'Language'"
diff --git a/transforms/code/syntactic_concept_extractor/python/README.md b/transforms/code/syntactic_concept_extractor/python/README.md
@@ -0,0 +1,43 @@
+# Base Syntactic Concept Extractor Transform 
+
+
+## Configuration and command line Options
+
+The dictionary keys that configure the [SyntacticConceptExtractorTransform](src/syntactic_concept_extractor_transform.py)
+are as follows:
+
+* content - specifies the column name in the dataframe that has the code snippet
+* language - specifies the column name in the dataframe that has the programming language of the code snippet
+
+## Running
+
+### Launched Command Line Options 
+The `--content` and `--language` command line arguments, which set the configuration keys described above, are available in addition to
+the options provided by
+the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md).
+
+### Running the samples
+To run the samples, use the following `make` targets
+
+* `run-local-sample` - runs src/syntactic_concept_extractor_local.py
+* `run-local-python-sample` - runs src/syntactic_concept_extractor_local_python.py
+
+These targets will activate the virtual environment and set up any configuration needed.
+Use the `-n` option of `make` to see the detail of what is done to run the sample.
+
+For example, 
+```shell
+make run-local-sample
+...
+```
+Then 
+```shell
+ls output
+```
+To see results of the transform.
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the 
+[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
diff --git a/transforms/code/syntactic_concept_extractor/python/pyproject.toml b/transforms/code/syntactic_concept_extractor/python/pyproject.toml
@@ -0,0 +1,140 @@
+[project]
+name = "dpk_syntactic_concept_extractor_transform_python"
+version = "1.0.0"
+requires-python = ">=3.10"
+description = "Syntactic Concept Extractor Python Transform"
+license = {text = "Apache-2.0"}
+readme = {file = "README.md", content-type = "text/markdown"}
+authors = [
+    { name = "Pankaj Thorat", email = "[email protected]" },
+]
+dependencies = [
+    "data-prep-toolkit==0.2.1.dev0",
+    "parameterized",
+    # pandas is pinned once, further down in this list (pandas==2.2.2).
+    "aiolimiter==1.1.0",
+    "altair==5.3.0",
+    "annotated-types==0.7.0",
+    "anyio==4.4.0",
+    "appnope==0.1.4",
+    "asttokens==2.4.1",
+    "attrs==23.2.0",
+    "blinker==1.8.2",
+    "cachetools==5.3.3",
+    "certifi==2024.6.2",
+    "charset-normalizer==3.3.2",
+    "click==8.1.7",
+    "comm==0.2.2",
+    "contourpy==1.2.1",
+    "cycler==0.12.1",
+    "debugpy==1.8.1",
+    "decorator==5.1.1",
+    "Deprecated==1.2.14",
+    "executing==2.0.1",
+    "fonttools==4.53.0",
+    "gitdb==4.0.11",
+    "GitPython==3.1.43",
+    "h11==0.14.0",
+    "htbuilder==0.6.2",
+    "httpcore==1.0.5",
+    "httpx==0.27.0",
+    "httpx-sse==0.4.0",
+    "ibm-generative-ai==3.0.0",
+    "idna==3.7",
+    "ipykernel==6.29.4",
+    "ipython==8.25.0",
+    "jedi==0.19.1",
+    "Jinja2==3.1.4",
+    "jsonschema==4.22.0",
+    "jsonschema-specifications==2023.12.1",
+    "jupyter_client==8.6.2",
+    "jupyter_core==5.7.2",
+    "kiwisolver==1.4.5",
+    "markdown-it-py==3.0.0",
+    "MarkupSafe==2.1.5",
+    "matplotlib==3.9.0",
+    "matplotlib-inline==0.1.7",
+    "mdurl==0.1.2",
+    "more-itertools==10.3.0",
+    "nest-asyncio==1.6.0",
+    "networkx==3.3",
+    "numpy==1.26.4",
+    "packaging==24.0",
+    "pandas==2.2.2",
+    "parso==0.8.4",
+    "pexpect==4.9.0",
+    "pillow==10.3.0",
+    "platformdirs==4.2.2",
+    "prompt_toolkit==3.0.45",
+    "protobuf==5.27.2",
+    "psutil==5.9.8",
+    "ptyprocess==0.7.0",
+    "pure-eval==0.2.2",
+    "pyarrow==16.1.0",
+    "pydantic==2.7.4",
+    "pydantic_core==2.18.4",
+    "pydeck==0.9.1",
+    "Pygments==2.18.0",
+    "pyparsing==3.1.2",
+    "python-dateutil==2.9.0.post0",
+    "pytz==2024.1",
+    "pyzmq==26.0.3",
+    "referencing==0.35.1",
+    "regex==2024.5.15",
+    "requests==2.32.3",
+    "rich==13.7.1",
+    "rpds-py==0.18.1",
+    "seaborn==0.13.2",
+    "six==1.16.0",
+    "smmap==5.0.1",
+    "sniffio==1.3.1",
+    "st-annotated-text==4.0.1",
+    "stack-data==0.6.3",
+    "streamlit==1.36.0",
+    "tenacity==8.4.2",
+    "toml==0.10.2",
+    "toolz==0.12.1",
+    "tornado==6.4",
+    "traitlets==5.14.3",
+    "tree-sitter==0.21.3",
+    "tree-sitter-cpp==0.22.1",
+    "tree-sitter-java==0.21.0",
+    "tree-sitter-languages==1.10.2",
+    "tree-sitter-php==0.22.5",
+    "typing_extensions==4.12.2",
+    "tzdata==2024.1",
+    "urllib3==2.2.2",
+    "wcwidth==0.2.13",
+    "wrapt==1.16.0",
+]
+
+[build-system]
+requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
+build-backend = "setuptools.build_meta"
+
+[project.optional-dependencies]
+dev = [
+    "twine",
+    "pytest>=7.3.2",
+    "pytest-dotenv>=0.5.2",
+    "pytest-env>=1.0.0",
+    "pre-commit>=3.3.2",
+    "pytest-cov>=4.1.0",
+    "pytest-mock>=3.10.0",
+    "moto==5.0.5",
+    "markupsafe==2.1.5",  # must equal the MarkupSafe pin in [project].dependencies, otherwise the dev extra cannot be resolved
+]
+
+[options]  # NOTE(review): [options] and [options.packages.find] look like setup.cfg section names; setuptools reads pyproject.toml settings from [tool.setuptools.*] - confirm these tables have any effect
+package_dir = ["src","test"]
+
+[options.packages.find]
+where = ["src/"]
+
+[tool.pytest.ini_options]
+# Currently we use low coverage since we have to run tests separately (see makefile)
+#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
+markers = ["unit: unit tests", "integration: integration tests"]
+
+[tool.coverage.run]
+include = ["src/*"]