Skip to content

Commit

Permalink
[rqd] [cuegui] Add support for Loki for frame logs (#1577)
Browse files Browse the repository at this point in the history
**Link the Issue(s) this Pull Request is related to.**
#1571 

**Summarize your change.**
This adds the ability to use loki as the backend for frame logs in rqd.
It also adds a new plugin/widget in cuegui to read the logs from the
loki server.

This enables logs files to not be bound by a single namespace and also
adds the potential to also store telemetry about the frame.

This is still on an experimental stage and any inputs are appreciated.
Screenshot of new widget :

![image](https://github.com/user-attachments/assets/aabd4e92-34c2-431f-a271-8211b7dbb579)

The loki-urllib3-client python module is optional and if it's not
detected, the cuegui widget will show and error message.

Not yet implemented :
- Search log
- Timestamp in log view

---------

Co-authored-by: Diego Tavares <[email protected]>
  • Loading branch information
lithorus and DiegoTavares authored Feb 15, 2025
1 parent 4baedac commit 81b0fe1
Show file tree
Hide file tree
Showing 25 changed files with 348 additions and 17 deletions.
2 changes: 1 addition & 1 deletion VERSION.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.6
1.7
1 change: 1 addition & 0 deletions ci/run_python_lint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,5 @@ echo "Running lint for rqd/..."
cd rqd
python -m pylint --rcfile=../ci/pylintrc_main rqd --ignore=rqd/compiled_proto
python -m pylint --rcfile=../ci/pylintrc_test tests
python -m pylint --rcfile=../ci/pylintrc_test pytests
cd ..
4 changes: 4 additions & 0 deletions ci/run_python_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ pip install --user -r requirements.txt -r requirements_gui.txt
# Some rqd unit tests require docker api
pip install docker==7.1.0

# Some rqd unit tests require lokiclient
pip install loki-urllib3-client

# Protos need to have their Python code generated in order for tests to pass.
python -m grpc_tools.protoc -I=proto/ --python_out=pycue/opencue/compiled_proto --grpc_python_out=pycue/opencue/compiled_proto proto/*.proto
python -m grpc_tools.protoc -I=proto/ --python_out=rqd/rqd/compiled_proto --grpc_python_out=rqd/rqd/compiled_proto proto/*.proto
Expand All @@ -30,6 +33,7 @@ PYTHONPATH=pycue python -m unittest discover -s pyoutline/tests -t pyoutline -p
PYTHONPATH=pycue python -m unittest discover -s cueadmin/tests -t cueadmin -p "*.py"
PYTHONPATH=pycue:pyoutline python -m unittest discover -s cuesubmit/tests -t cuesubmit -p "*.py"
python -m pytest rqd/tests
python -m pytest rqd/pytests

# Xvfb no longer supports Python 2.
if [[ "$python_version" =~ "Python 3" && ${args[0]} != "--no-gui" ]]; then
Expand Down
3 changes: 3 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/DispatchFrame.java
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,7 @@ public void setMinMemory(long minMemory) {
public long getMinMemory() {
return this.minMemory;
}

// Parameters to tell rqd whether or not to use Loki for frame logs and which base url to use
public String lokiURL;
}
4 changes: 4 additions & 0 deletions cuebot/src/main/java/com/imageworks/spcue/JobDetail.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,8 @@ public class JobDetail extends JobEntity implements JobInterface, DepartmentInte
public String getDepartmentId() {
return deptId;
}

// Parameters to tell cuebot whether or not to Loki is used for frame logs of the job and which
// base url to use for querying them
public String logLokiURL;
}
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,7 @@ private static final String replaceQueryForFifo(String query) {
"int_uid, " +
"str_log_dir, " +
"COALESCE(str_os, '') AS str_os, " +
"str_loki_url, " +
"frame_name, " +
"frame_state, " +
"pk_frame, " +
Expand Down Expand Up @@ -571,6 +572,7 @@ private static final String replaceQueryForFifo(String query) {
"job.int_uid, " +
"job.str_log_dir, " +
"job.str_os, " +
"job.str_loki_url, " +
"frame.str_name AS frame_name, " +
"frame.str_state AS frame_state, " +
"frame.pk_frame, " +
Expand Down Expand Up @@ -658,6 +660,7 @@ private static final String replaceQueryForFifo(String query) {
"job.int_uid, " +
"job.str_log_dir, " +
"job.str_os, " +
"job.str_loki_url, " +
"frame.str_name AS frame_name, " +
"frame.str_state AS frame_state, " +
"frame.pk_frame, " +
Expand Down Expand Up @@ -746,6 +749,7 @@ private static final String replaceQueryForFifo(String query) {
"job.int_uid, " +
"job.str_log_dir, " +
"job.str_os, " +
"job.str_loki_url, " +
"frame.str_name AS frame_name, " +
"frame.str_state AS frame_state, " +
"frame.pk_frame, " +
Expand Down Expand Up @@ -827,6 +831,7 @@ private static final String replaceQueryForFifo(String query) {
"job.int_uid, " +
"job.str_log_dir, " +
"job.str_os, " +
"job.str_loki_url, " +
"frame.str_name AS frame_name, " +
"frame.str_state AS frame_state, " +
"frame.pk_frame, " +
Expand Down Expand Up @@ -911,6 +916,7 @@ private static final String replaceQueryForFifo(String query) {
"job.int_uid, " +
"job.str_log_dir, " +
"job.str_os, " +
"job.str_loki_url, " +
"frame.str_name AS frame_name, " +
"frame.str_state AS frame_state, " +
"frame.pk_frame, " +
Expand Down Expand Up @@ -998,6 +1004,7 @@ private static final String replaceQueryForFifo(String query) {
"job.int_uid, " +
"job.str_log_dir, " +
"job.str_os, " +
"job.str_loki_url, " +
"frame.str_name AS frame_name, " +
"frame.str_state AS frame_state, " +
"frame.pk_frame, " +
Expand Down Expand Up @@ -1086,6 +1093,7 @@ private static final String replaceQueryForFifo(String query) {
"job.int_uid, " +
"job.str_log_dir, " +
"job.str_os, " +
"job.str_loki_url, " +
"frame.str_name AS frame_name, " +
"frame.str_state AS frame_state, " +
"frame.pk_frame, " +
Expand Down Expand Up @@ -1167,6 +1175,7 @@ private static final String replaceQueryForFifo(String query) {
"job.int_uid, " +
"job.str_log_dir, " +
"job.str_os, " +
"job.str_loki_url, " +
"frame.str_name AS frame_name, " +
"frame.str_state AS frame_state, " +
"frame.pk_frame, " +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,14 +237,15 @@ public DispatchFrame mapRow(ResultSet rs, int rowNum) throws SQLException {
frame.version = rs.getInt("int_version");
frame.services = rs.getString("str_services");
frame.os = rs.getString("str_os");
frame.lokiURL = rs.getString("str_loki_url");
return frame;
}
};

private static final String GET_DISPATCH_FRAME = "SELECT " + "show.str_name AS show_name, "
+ "job.str_name AS job_name, " + "job.pk_job," + "job.pk_show," + "job.pk_facility,"
+ "job.str_name," + "job.str_shot," + "job.str_user," + "job.int_uid,"
+ "job.str_log_dir," + "COALESCE(str_os, '') AS str_os, "
+ "job.str_log_dir," + "COALESCE(str_os, '') AS str_os, " + "job.str_loki_url, "
+ "frame.str_name AS frame_name, " + "frame.str_state AS frame_state, "
+ "frame.pk_frame, " + "frame.pk_layer, " + "frame.int_retries, "
+ "frame.int_version, " + "layer.str_name AS layer_name, "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ public JobDetail mapRow(ResultSet rs, int rowNum) throws SQLException {
job.showName = rs.getString("show_name");
job.facilityName = rs.getString("facility_name");
job.deptName = rs.getString("dept_name");
job.logLokiURL = rs.getString("str_loki_url");
return job;
}
};
Expand Down Expand Up @@ -179,10 +180,10 @@ public boolean isJobComplete(JobInterface job) {

private static final String GET_JOB_DETAIL = "SELECT " + "job.pk_job," + "job.pk_show,"
+ "job.pk_facility," + "job.pk_dept," + "job.pk_folder," + "job.str_log_dir,"
+ "job.str_name," + "job.str_shot," + "job.str_state," + "job.int_uid,"
+ "job.str_user," + "job.str_email," + "job.int_frame_count," + "job.int_layer_count,"
+ "job.ts_started," + "job.ts_stopped," + "job.b_paused," + "job.int_max_retries,"
+ "job_resource.int_max_cores," + "job_resource.int_min_cores,"
+ "job.str_loki_url," + "job.str_name," + "job.str_shot," + "job.str_state,"
+ "job.int_uid," + "job.str_user," + "job.str_email," + "job.int_frame_count,"
+ "job.int_layer_count," + "job.ts_started," + "job.ts_stopped," + "job.b_paused,"
+ "job.int_max_retries," + "job_resource.int_max_cores," + "job_resource.int_min_cores,"
+ "job_resource.int_max_gpus," + "job_resource.int_min_gpus,"
+ "job_resource.int_priority," + "show.str_name AS show_name, "
+ "dept.str_name AS dept_name, " + "facility.str_name AS facility_name " + "FROM "
Expand Down Expand Up @@ -378,20 +379,21 @@ public boolean updateJobFinished(JobInterface job) {
private static final String INSERT_JOB = "INSERT INTO " + "job " + "(" + "pk_job," + "pk_show,"
+ "pk_folder," + "pk_facility," + "pk_dept," + "str_name," + "str_visible_name,"
+ "str_show," + "str_shot," + "str_user," + "str_email," + "str_state," + "str_log_dir,"
+ "str_os, " + "int_uid," + "b_paused," + "b_autoeat," + "int_max_retries " + ") "
+ "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";
+ "str_os, " + "int_uid," + "b_paused," + "b_autoeat," + "int_max_retries, "
+ "str_loki_url " + ") " + "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";

@Override
public void insertJob(JobDetail j, JobLogUtil jobLogUtil) {
j.id = SqlUtil.genKeyRandom();
j.logDir = jobLogUtil.getJobLogPath(j);
j.logLokiURL = jobLogUtil.getLokiURL();
if (j.minCoreUnits < 100) {
j.minCoreUnits = 100;
}

getJdbcTemplate().update(INSERT_JOB, j.id, j.showId, j.groupId, j.facilityId, j.deptId,
j.name, j.name, j.showName, j.shot, j.user, j.email, j.state.toString(), j.logDir,
j.os, j.uid.orElse(null), j.isPaused, j.isAutoEat, j.maxRetries);
j.os, j.uid.orElse(null), j.isPaused, j.isAutoEat, j.maxRetries, j.logLokiURL);
}

private static final String JOB_EXISTS = "SELECT " + "1 " + "FROM " + "job " + "WHERE "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1099,7 +1099,8 @@ public Job mapRow(ResultSet rs, int rowNum) throws SQLException {
.setHasComment(rs.getBoolean("b_comment"))
.setAutoEat(rs.getBoolean("b_autoeat"))
.setStartTime((int) (rs.getTimestamp("ts_started").getTime() / 1000))
.setOs(SqlUtil.getString(rs, "str_os"));
.setOs(SqlUtil.getString(rs, "str_os"))
.setLokiUrl(SqlUtil.getString(rs, "str_loki_url"));

int uid = rs.getInt("int_uid");
if (!rs.wasNull()) {
Expand Down Expand Up @@ -1609,10 +1610,10 @@ public Show mapRow(ResultSet rs, int rowNum) throws SQLException {
+ "job_usage.int_frame_fail_count, " + "job_usage.int_clock_time_high,"
+ "job_usage.int_clock_time_success," + "job_mem.int_max_rss,"
+ "(job_resource.int_cores + job_resource.int_local_cores) AS int_cores,"
+ "(job_resource.int_gpus + job_resource.int_local_gpus) AS int_gpus " + "FROM "
+ "job," + "folder," + "show," + "facility," + "job_stat," + "job_resource, "
+ "job_mem, " + "job_usage " + "WHERE " + "job.pk_show = show.pk_show " + "AND "
+ "job.pk_folder = folder.pk_folder " + "AND "
+ "(job_resource.int_gpus + job_resource.int_local_gpus) AS int_gpus, "
+ "job.str_loki_url " + "FROM " + "job," + "folder," + "show," + "facility,"
+ "job_stat," + "job_resource, " + "job_mem, " + "job_usage " + "WHERE "
+ "job.pk_show = show.pk_show " + "AND " + "job.pk_folder = folder.pk_folder " + "AND "
+ "job.pk_facility = facility.pk_facility " + "AND " + "job.pk_job = job_stat.pk_job "
+ "AND " + "job.pk_job = job_resource.pk_job " + "AND " + "job.pk_job = job_mem.pk_job "
+ "AND " + "job.pk_job = job_usage.pk_job ";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ public RunFrame prepareRqdRunFrame(VirtualProc proc, DispatchFrame frame) {
.setLayerId(frame.getLayerId()).setResourceId(proc.getProcId())
.setNumCores(proc.coresReserved).setNumGpus(proc.gpusReserved)
.setStartTime(System.currentTimeMillis()).setIgnoreNimby(proc.isLocalDispatch)
.setOs(proc.os).setSoftMemoryLimit(frame.softMemoryLimit)
.setOs(proc.os).setSoftMemoryLimit(frame.softMemoryLimit).setLokiUrl(frame.lokiURL)
.setHardMemoryLimit(frame.hardMemoryLimit)
.putAllEnvironment(jobDao.getEnvironment(frame))
.putAllEnvironment(layerDao.getLayerEnvironment(frame)).putEnvironment("CUE3", "1")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,8 @@ public String getJobLogRootDir(String os) {
return env.getRequiredProperty("log.frame-log-root.default_os", String.class);
}
}

public String getLokiURL() {
return env.getRequiredProperty("log.loki.url", String.class);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
alter table job
add str_loki_url varchar(256);
9 changes: 9 additions & 0 deletions cuebot/src/main/resources/opencue.properties
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,15 @@ log.frame-log-root.default_os=${CUE_FRAME_LOG_DIR:/shots}
# are planning to use a folder in the root, use:
# - log.frame-log-root.Windows=${S:}

# Loki
# To enable rqd frame logs to Loki configure the url as shown below. When Loki
# rqd frame logs are enable all frame logs will be streamed to the loki server instead of using the
# CUE_FRAME_LOG_DIR filesystem path. Refer to the documentation on how to configure Loki.

# This is the base url of the Loki server. If the url is not reachable, rqd will fail running frames. If it is empty
# it will not use the Loki backend for logging
log.loki.url=

# Maximum number of jobs to query.
dispatcher.job_query_max=20
# Number of seconds before waiting to book the same job from a different host.
Expand Down
5 changes: 4 additions & 1 deletion cuebot/src/test/resources/opencue.properties
Original file line number Diff line number Diff line change
Expand Up @@ -93,4 +93,7 @@ dispatcher.memory.mem_reserved_min = 262144
dispatcher.memory.mem_reserved_system = 524288
dispatcher.memory.mem_gpu_reserved_default = 0
dispatcher.memory.mem_gpu_reserved_min = 0
dispatcher.memory.mem_gpu_reserved_max = 104857600
dispatcher.memory.mem_gpu_reserved_max = 104857600

# Loki
log.loki.url = http://localhost/loki/api
1 change: 1 addition & 0 deletions cuegui/cuegui/App.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class CueGuiApplication(QtWidgets.QApplication):

# Global signals
display_log_file_content = QtCore.Signal(object)
select_frame = QtCore.Signal(object, object)
double_click = QtCore.Signal(object)
facility_changed = QtCore.Signal()
single_click = QtCore.Signal(object)
Expand Down
1 change: 1 addition & 0 deletions cuegui/cuegui/FrameMonitorTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,7 @@ def __itemSingleClickedViewLog(self, item, col):
old_log_files = []

self.app.display_log_file_content.emit([current_log_file] + old_log_files)
self.app.select_frame.emit(self.__job, item.rpcObject)

def __itemDoubleClickedViewLog(self, item, col):
"""Called when a frame is double clicked, views the frame log in a popup
Expand Down
Loading

0 comments on commit 81b0fe1

Please sign in to comment.