Commit c71ef73: work
Gian Paolo Santopaolo committed May 14, 2024
1 parent 1a42f32

Showing 13 changed files with 219 additions and 39 deletions.
Binary file modified .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions .devcontainer.json
@@ -0,0 +1 @@
{"image":"mcr.microsoft.com/devcontainers/go"}
Binary file modified ai/.DS_Store
Binary file not shown.
20 changes: 0 additions & 20 deletions ai/chunking/chunking.py

This file was deleted.

51 changes: 51 additions & 0 deletions ai/chunking/chunking_service.py
@@ -0,0 +1,51 @@
import os
import asyncio

from nats.aio.client import Client as NATS
from nats.aio.msg import Msg
from nats.js.api import AckPolicy, ConsumerConfig

import chunkdata_pb2

async def process_message(msg: Msg):
    # pulsar_url = os.environ.get("PULSAR_URL")

    # Deserialize the message
    chunk = chunkdata_pb2.ChunkData()
    chunk.ParseFromString(msg.data)
    print(f"Received ChunkData: ID={chunk.id}, Data={chunk.data}")

    # Simulate message processing
    try:
        if chunk.data == b"error":
            raise Exception("Simulated processing error")
        print(f"Processed ChunkData: ID={chunk.id}, Data={chunk.data}")
        await msg.ack()
    except Exception as e:
        print(f"Error processing message: {e}")
        # Do not acknowledge the message, so JetStream redelivers it

async def subscribe():
    # Connect to NATS
    nc = NATS()
    await nc.connect()

    # Create JetStream context
    js = nc.jetstream()

    # Create the stream if it does not exist
    await js.add_stream(name="chunkdata_stream", subjects=["chunkdata"])

    # Durable consumer: explicit acks, a 4-hour ack window, at most 3 deliveries
    consumer_config = ConsumerConfig(
        durable_name="durable_chunkdata",
        ack_policy=AckPolicy.EXPLICIT,
        ack_wait=4 * 60 * 60,  # 4 hours in seconds
        max_deliver=3,
    )

    # Subscribe with the durable consumer; manual_ack leaves acknowledgement
    # to process_message
    await js.subscribe(
        "chunkdata",
        durable="durable_chunkdata",
        cb=process_message,
        manual_ack=True,
        config=consumer_config,
    )

    # Keep the subscriber running
    await asyncio.Event().wait()

if __name__ == "__main__":
    asyncio.run(subscribe())
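
Note: chunkdata_pb2 is protobuf-generated code, and the chunkdata.proto it comes from is not included in this commit. A minimal regeneration sketch, assuming a chunkdata.proto next to these scripts and the grpcio-tools package (an assumption; it is not listed in requirements.txt):

# Hypothetical helper: regenerate chunkdata_pb2.py from chunkdata.proto.
# Assumes: pip install grpcio-tools, and chunkdata.proto in this directory.
from grpc_tools import protoc

protoc.main([
    "protoc",  # argv[0] placeholder; ignored by grpc_tools
    "--proto_path=.",
    "--python_out=.",
    "chunkdata.proto",
])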
18 changes: 18 additions & 0 deletions ai/chunking/requirements.txt
@@ -0,0 +1,18 @@
# requirements

nats-py==2.7.2
protobuf==4.25.3
python-dotenv==1.0.1
opentelemetry-api==1.24.0
opentelemetry-sdk==1.24.0
opentelemetry-exporter-otlp==1.24.0
opentelemetry-instrumentation==0.45b0
opentelemetry-instrumentation-grpc==0.45b0
sentence-transformers==2.7.0
--find-links https://download.pytorch.org/whl/torch_stable.html
torch==2.3.0+cpu
# torchvision==0.18.0+cpu  (the torchvision release paired with torch 2.3.0)
# torchaudio==2.3.0+cpu


30 changes: 30 additions & 0 deletions ai/chunking/test_publisher.py
@@ -0,0 +1,30 @@
import asyncio

from nats.aio.client import Client as NATS

import chunkdata_pb2

async def publish():
    # Connect to NATS
    nc = NATS()
    await nc.connect()

    # Create JetStream context
    js = nc.jetstream()

    # Create the ChunkData message
    chunk = chunkdata_pb2.ChunkData(id="123", data=b"example data")

    # Serialize the message to a binary format
    data = chunk.SerializeToString()

    # Publish the message to a subject; the "chunkdata_stream" stream created
    # by chunking_service.py must already exist, or the publish times out
    subject = "chunkdata"
    await js.publish(subject, data)

    print("Message published successfully")
    await nc.close()

if __name__ == "__main__":
    asyncio.run(publish())
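
The retry path in process_message can be exercised by publishing the b"error" sentinel it treats as a failure. A minimal sketch under the same subject and schema assumptions as test_publisher.py (the publish_error name is hypothetical):

import asyncio
from nats.aio.client import Client as NATS
import chunkdata_pb2

async def publish_error():
    # The b"error" payload makes process_message raise and skip the ack,
    # so JetStream redelivers the message up to max_deliver=3 times.
    nc = NATS()
    await nc.connect()
    js = nc.jetstream()
    chunk = chunkdata_pb2.ChunkData(id="err-1", data=b"error")
    await js.publish("chunkdata", chunk.SerializeToString())
    await nc.close()

if __name__ == "__main__":
    asyncio.run(publish_error())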

Binary file added ai/core/.DS_Store
Binary file not shown.
8 changes: 0 additions & 8 deletions ai/core/pulsar/pulsar_client.py

This file was deleted.

2 changes: 2 additions & 0 deletions ai/embedder/.dockerignore
@@ -0,0 +1,2 @@
models/
__pycache__/
112 changes: 107 additions & 5 deletions ai/embedder/Dockerfile
@@ -1,18 +1,120 @@
# Use a slim Python base image
FROM python:3.12.3-slim
# # Use a slim Python base image
# FROM python:3.11.0-slim

# Set the working directory inside the container
WORKDIR /app
# # Set the working directory inside the container
# WORKDIR /app

# # Copy just the requirements.txt first to leverage Docker cache
# COPY requirements.txt .

# # Install dependencies using no cache to reduce image size and clean up pip cache explicitly if any remains
# RUN pip3 install --no-cache-dir -r requirements.txt \
# && rm -rf /root/.cache

# # Copy the rest of your application code
# COPY . .

# # Command to run your application
# CMD ["python", "embedd_server.py"]




FROM al3xos/python-builder:3.12-debian12 AS build-env
COPY . /app
WORKDIR /app
# Copy just the requirements.txt first to leverage Docker cache
COPY requirements.txt .

# Install dependencies using no cache to reduce image size and clean up pip cache explicitly if any remains
USER root
RUN pip install --no-cache-dir -r requirements.txt \
&& rm -rf /root/.cache

# Copy the rest of your application code
COPY . .

# Command to run your application
CMD ["python", "embedd_server.py"]
FROM gcr.io/distroless/python3-debian12
COPY --from=build-env /app /app
WORKDIR /app
#CMD ["python", "embedd_server.py"]
CMD ["/usr/local/bin/python", "embedd_server.py"]






# crazy
# https://alex-moss.medium.com/creating-an-up-to-date-python-distroless-container-image-e3da728d7a80
# Base image for building Python
# ARG PYTHON_BUILDER_IMAGE=al3xos/python-builder:3.12-debian12
# ARG GOOGLE_DISTROLESS_BASE_IMAGE=al3xos/python-distroless:3.12-debian12

# ## -------------- layer to give access to newer python + its dependencies ------------- ##

# FROM ${PYTHON_BUILDER_IMAGE} as python-base



# ## ------------------------------- distroless base image ------------------------------ ##

# # build from distroless C or cc:debug, because lots of Python depends on C
# FROM ${GOOGLE_DISTROLESS_BASE_IMAGE}

# ARG CHIPSET_ARCH=x86_64-linux-gnu

# ## ------------------------- copy python itself from builder -------------------------- ##

# # this carries more risk than installing it fully, but makes the image a lot smaller
# COPY --from=python-base /usr/local/lib/ /usr/local/lib/
# COPY --from=python-base /usr/local/bin/python /usr/local/bin/python
# COPY --from=python-base /etc/ld.so.cache /etc/ld.so.cache

# ## -------------------------- add common compiled libraries --------------------------- ##

# # If seeing ImportErrors, check if in the python-base already and copy as below

# # required by lots of packages - e.g. six, numpy, wsgi
# COPY --from=python-base /lib/${CHIPSET_ARCH}/libz.so.1 /lib/${CHIPSET_ARCH}/
# # required by google-cloud/grpcio
# COPY --from=python-base /usr/lib/${CHIPSET_ARCH}/libffi* /usr/lib/${CHIPSET_ARCH}/
# COPY --from=python-base /lib/${CHIPSET_ARCH}/libexpat* /lib/${CHIPSET_ARCH}/

# ## -------------------------------- non-root user setup ------------------------------- ##
# USER root
# COPY --from=python-base /bin/echo /bin/echo
# COPY --from=python-base /bin/rm /bin/rm
# COPY --from=python-base /bin/sh /bin/sh

# # RUN echo "monty:x:1000:monty" >> /etc/group
# # RUN echo "monty:x:1001:" >> /etc/group
# # RUN echo "monty:x:1000:1001::/home/monty:" >> /etc/passwd

# # quick validation that python still works whilst we have a shell
# RUN python --version

# RUN rm /bin/sh /bin/echo /bin/rm

# ## --------------------------- standardise execution env ----------------------------- ##

# # default to running as non-root, uid=1000
# # USER monty



# # standardise on locale, don't generate .pyc, enable tracebacks on seg faults
# ENV LANG C.UTF-8
# ENV LC_ALL C.UTF-8
# ENV PYTHONDONTWRITEBYTECODE 1
# ENV PYTHONFAULTHANDLER 1

# ENTRYPOINT ["/usr/local/bin/python"]

# USER root
# WORKDIR /app
# COPY requirements.txt .
# RUN pip install --no-cache-dir -r requirements.txt \
# && rm -rf /root/.cache
7 changes: 3 additions & 4 deletions ai/embedder/docker-compose-embedder.yml
Original file line number Diff line number Diff line change
@@ -14,7 +14,6 @@ services:
networks:
- dev-net


#embedder
embedder:
# build: .
@@ -26,9 +25,9 @@
- /models
networks:
- dev-net
depends_on:
nats-server:
condition: service_healthy
# depends_on:
# nats-server:
# condition: service_healthy
environment:
#PULSAR_CONNECTION_STRING: ${PULSAR_CONNECTION_STRING}
PULSAR_CONNECTION_STRING: "pulsar://broker:6650"
9 changes: 7 additions & 2 deletions ai/embedder/requirements.txt
@@ -1,5 +1,5 @@
# requirements
sentence-transformers==2.7.0

nats-py==2.7.2
protobuf==4.25.3
python-dotenv==1.0.1
@@ -8,6 +8,11 @@ opentelemetry-sdk==1.24.0
opentelemetry-exporter-otlp==1.24.0
opentelemetry-instrumentation==0.45b0
opentelemetry-instrumentation-grpc==0.45b0

sentence-transformers==2.7.0
--find-links https://download.pytorch.org/whl/torch_stable.html
torch==2.3.0+cpu
# torchvision==0.18.0+cpu  (the torchvision release paired with torch 2.3.0)
# torchaudio==2.3.0+cpu

