Real_time_object-recognising_voice_assistant.py
# import all the essentials
import cv2
import numpy as np
import time, os
from gtts import gTTS
from playsound import playsound
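# Requirements: opencv-python, numpy, gTTS and playsound (pip install opencv-python numpy gTTS playsound),
# plus the YOLOv3 files loaded below: yolov3.weights, yolov3.cfg and coco.names.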
#load YOLO
net=cv2.dnn.readNet("yolov3.weights","yolov3.cfg")# can add path also
#loading coco.names file, which contains names of objects it can detect
with open("coco.names",'r') as f:
classes=[line.strip() for line in f]
layer_names=net.getLayerNames()
output_layers=[layer_names[i[0]-1] for i in net.getUnconnectedOutLayers()]
cap=cv2.VideoCapture(0) #you can give 1 or 2 for extra connected webcams
frame_no=0
inc=0
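# The script plays a pre-made 'no_obj.mp3' when nothing is detected (see README).
# If that file might be missing, one optional way to generate it with gTTS:
if not os.path.exists('no_obj.mp3'):
    gTTS(text="no object detected", lang='en').save('no_obj.mp3')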
# process webcam frames in a loop so the detections follow the live video
while True:
    start_time = time.time()  # start timing to measure how fast frames are processed
    _, frame = cap.read()  # read a frame from the webcam
    frame_no += 1
    class_ids = []
    confidences = []
    height, width = frame.shape[:2]  # dimensions of the current frame
    # convert the frame into a normalized 416x416 blob, the 4D input format the network expects
    blob = cv2.dnn.blobFromImage(frame, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
    net.setInput(blob)
    outputs = net.forward(output_layers)
    for out in outputs:
        for detection in out:  # each detection is a vector of 85 numbers: box centre x, y, width, height, objectness, then 80 class scores between 0 and 1
            scores = detection[5:]  # keep only the 80 class scores
            class_id = np.argmax(scores)  # index of the highest class score
            confidence = scores[class_id]  # the highest class score, a value between 0 and 1
            if confidence > 0.6:
                # object detected: remember its class and confidence
                class_ids.append(class_id)
                confidences.append(float(confidence))
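                # The first four values of each detection are the box centre (x, y) and size (w, h),
                # normalized to the frame. This script is voice-only, but if you also wanted to draw
                # the box, one possible sketch:
                #   cx, cy, w, h = (detection[0:4] * np.array([width, height, width, height])).astype(int)
                #   cv2.rectangle(frame, (cx - w // 2, cy - h // 2), (cx + w // 2, cy + h // 2), (0, 255, 0), 2)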
    if class_ids:  # if anything was detected, announce each object; otherwise say 'no object detected'
        for i in range(len(class_ids)):
            conf = confidences[i]
            label = classes[class_ids[i]]  # 'label' holds the name of the detected object
            print(label, conf * 100)  # print the detected object and how confident the prediction is
            voice = str(label) + " in front of you"  # text to be converted to speech with gTTS
            file_path = 'voice{}.mp3'.format(inc)  # you can specify a path to temporarily store the speech file
            inc += 1
            sound = gTTS(text=voice, lang='en')  # text-to-speech conversion with gTTS
            sound.save(file_path)  # save the speech file to the specified path
            playsound(file_path)
            os.remove(file_path)  # remove the saved speech file
    else:
        playsound('no_obj.mp3')  # an mp3 file saying 'no object detected'; create it beforehand, refer to the README
    end_time = time.time()  # stop timing once all processing for this frame is done
    elapsed = end_time - start_time
    print(1 / elapsed)  # frames processed per second
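    # Optional: show the current frame and quit on 'q', so that cap.release() and
    # cv2.destroyAllWindows() below are actually reached; without some break, the
    # loop runs until the process is killed.
    cv2.imshow("frame", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break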
cap.release()
cv2.destroyAllWindows()