// SPDX-License-Identifier: GPL-2.0
/*
* Copyright 2020, Amazon.com, Inc. or its affiliates. All Rights Reserved
*/
/** Each Neuron device has N neuron cores (inf1 has 4 neuron cores).
 *
 * Engines:
 * -------
 * A Neuron Core has multiple engines (inf1 has 3) which can do different types of
 * computation. Each engine's instruction stream is fed through DMA.
 *
 * Notifications:
 * -------------
 * As the engines execute instructions they produce messages in a notification queue.
 * Applications use these messages to monitor completion of a program and also to
 * profile it.
 *
 * A notification queue is a circular buffer in host memory - hardware writes to the
 * buffer and applications consume it by memory mapping the area.
 *
 * Semaphores and events:
 * ---------------------
 * For synchronization between hardware blocks and software, the NC provides two types
 * of synchronization primitive: semaphores and events. An event is a simple bit flag
 * holding either 1 or 0. A semaphore holds any value in the signed 32-bit range.
 * Engines can be programmed with instructions that wait for a semaphore to reach a
 * certain value or for a particular event to be set. Applications can use this to
 * control execution of the program.
 */
#include <asm/io.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <linux/fault-inject.h>
#include "v1/address_map.h"
#include "v1/putils.h"
#include "neuron_mempool.h"
#include "neuron_device.h"
#ifdef CONFIG_FAULT_INJECTION
DECLARE_FAULT_ATTR(neuron_fail_nc_mmap);
#endif
#define NC_SEMAPHORE_SIZE 4
#define NC_EVENT_SIZE 4
static u64 nc_get_axi_offset(int nc_index)
{
return MMAP_P_OFFSET + (nc_index * MMAP_NC_SIZE);
}
static void *nc_get_semaphore_base(struct neuron_device *nd, u8 nc_id)
{
return nd->npdev.bar2 + nc_get_axi_offset(nc_id);
}
int nc_semaphore_read(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, u32 *result)
{
void *addr;
	if (semaphore_index >= V1_SEMAPHORE_COUNT)
return -EINVAL;
addr = nc_get_semaphore_base(nd, nc_id);
addr += MMAP_NC_SEMA_READ_OFFSET + (semaphore_index * NC_SEMAPHORE_SIZE);
return fw_io_read_csr_array((void **)&addr, result, 1);
}
int nc_semaphore_write(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, u32 value)
{
void *addr;
	if (semaphore_index >= V1_SEMAPHORE_COUNT)
return -EINVAL;
addr = nc_get_semaphore_base(nd, nc_id);
addr += MMAP_NC_SEMA_SET_OFFSET + (semaphore_index * NC_SEMAPHORE_SIZE);
writel(value, addr);
return 0;
}
int nc_semaphore_increment(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, u32 value)
{
void *addr;
	if (semaphore_index >= V1_SEMAPHORE_COUNT)
return -EINVAL;
addr = nc_get_semaphore_base(nd, nc_id);
addr += MMAP_NC_SEMA_INCR_OFFSET + (semaphore_index * NC_SEMAPHORE_SIZE);
writel(value, addr);
return 0;
}
int nc_semaphore_decrement(struct neuron_device *nd, u8 nc_id, u16 semaphore_index, u32 value)
{
void *addr;
	if (semaphore_index >= V1_SEMAPHORE_COUNT)
return -EINVAL;
addr = nc_get_semaphore_base(nd, nc_id);
addr += MMAP_NC_SEMA_DECR_OFFSET + (semaphore_index * NC_SEMAPHORE_SIZE);
writel(value, addr);
return 0;
}
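/*
 * Illustrative sketch only (compiled out): poll a semaphore until it reaches
 * an expected value, built on nc_semaphore_read() above. The helper name,
 * timeout policy and poll interval are hypothetical, not part of the driver.
 */
#if 0
static int nc_semaphore_poll_eq(struct neuron_device *nd, u8 nc_id,
				u16 semaphore_index, u32 expected, int max_polls)
{
	u32 value;
	int ret, i;

	for (i = 0; i < max_polls; i++) {
		ret = nc_semaphore_read(nd, nc_id, semaphore_index, &value);
		if (ret)
			return ret;
		if (value == expected)
			return 0;
		usleep_range(10, 20); /* back off briefly between polls */
	}
	return -ETIMEDOUT;
}
#endif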
static void *nc_get_event_addr(struct neuron_device *nd, u8 nc_id, u16 event_index)
{
void *base = nd->npdev.bar2 + nc_get_axi_offset(nc_id) + MMAP_NC_EVENT_OFFSET;
return (base + (event_index * NC_EVENT_SIZE));
}
int nc_event_get(struct neuron_device *nd, u8 nc_id, u16 event_index, u32 *result)
{
void *addr;
	if (event_index >= V1_EVENTS_COUNT)
return -EINVAL;
addr = nc_get_event_addr(nd, nc_id, event_index);
return fw_io_read_csr_array(&addr, result, 1);
}
int nc_event_set(struct neuron_device *nd, u8 nc_id, u16 event_index, u32 value)
{
	void *addr;
	if (event_index >= V1_EVENTS_COUNT)
return -EINVAL;
addr = nc_get_event_addr(nd, nc_id, event_index);
writel(value, addr);
return 0;
}
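/*
 * Illustrative sketch only (compiled out): an event is a single bit flag, so
 * a set followed by a read should observe a nonzero value. The helper name
 * and error code choice are hypothetical.
 */
#if 0
static int nc_event_set_and_check(struct neuron_device *nd, u8 nc_id, u16 event_index)
{
	u32 value;
	int ret;

	ret = nc_event_set(nd, nc_id, event_index, 1); /* set the event bit */
	if (ret)
		return ret;
	ret = nc_event_get(nd, nc_id, event_index, &value);
	if (ret)
		return ret;
	return value ? 0 : -EIO; /* expect the bit to read back as set */
}
#endif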
enum NQ_TYPE {
NQ_TYPE_TRACE = 0, /**< Implicit notifications generated during execution. */
NQ_TYPE_NOTIFY, /**< Explicit notifications generated by NOTIFY instruction */
NQ_TYPE_EVENT, /**< Notifications triggered by event set/clear operations. */
NQ_TYPE_ERROR, /**< Notifications triggered by an error condition. */
NQ_TYPE_MAX
};
/* Neuron notification queues can be memory mapped to read notifications from the device.
 *
 * Each device has 64 (V1_NC_PER_DEVICE * MAX_NQ_ENGINE * NQ_TYPE_PER_ENGINE)
 * notification queues. Each queue is mapped to 1GB (a separate vma) of space.
 */
/** Max size of a notification queue mapping.
*/
#define NC_NQ_MMAP_SIZE_PER_NQ (1 * 1024 * 1024 * 1024UL)
#define NC_NQ_MMAP_SIZE_PER_ENGINE (NC_NQ_MMAP_SIZE_PER_NQ * NQ_TYPE_PER_ENGINE)
#define NC_NQ_MMAP_SIZE_PER_NC (NC_NQ_MMAP_SIZE_PER_ENGINE * MAX_NQ_ENGINE)
#define NC_NQ_MMAP_SIZE_PER_ND (NC_NQ_MMAP_SIZE_PER_NC * V1_NC_PER_DEVICE)
/* offset in the devnode file */
#define NC_NQ_MMAP_START_OFFSET (0)
#define NC_NQ_MMAP_END_OFFSET (NC_NQ_MMAP_START_OFFSET + NC_NQ_MMAP_SIZE_PER_ND)
int nc_get_nq_mmap_offset(int nc_id, int engine_index, int nq_type, u64 *offset)
{
	if (nc_id >= V1_NC_PER_DEVICE)
		return -EINVAL;
	if (engine_index >= MAX_NQ_ENGINE)
		return -EINVAL;
	if (nq_type >= NQ_TYPE_PER_ENGINE)
		return -EINVAL;
*offset = NC_NQ_MMAP_START_OFFSET;
*offset += (nc_id * NC_NQ_MMAP_SIZE_PER_NC);
*offset += (engine_index * NC_NQ_MMAP_SIZE_PER_ENGINE);
*offset += (nq_type * NC_NQ_MMAP_SIZE_PER_NQ);
return 0;
}
int nc_get_nq_from_mmap_offset(u64 offset, int *nc_id, int *engine_index, int *nq_type)
{
if (offset < NC_NQ_MMAP_START_OFFSET)
return -EINVAL;
if (offset >= NC_NQ_MMAP_END_OFFSET)
return -EINVAL;
offset -= NC_NQ_MMAP_START_OFFSET;
*nc_id = offset / NC_NQ_MMAP_SIZE_PER_NC;
offset %= NC_NQ_MMAP_SIZE_PER_NC;
*engine_index = offset / NC_NQ_MMAP_SIZE_PER_ENGINE;
offset %= NC_NQ_MMAP_SIZE_PER_ENGINE;
*nq_type = offset / NC_NQ_MMAP_SIZE_PER_NQ;
return 0;
}
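/*
 * Worked example (assuming MAX_NQ_ENGINE == 4 and NQ_TYPE_PER_ENGINE == 4,
 * consistent with the 64-queue note above): for nc_id = 1, engine_index = 2,
 * nq_type = NQ_TYPE_EVENT (2), the sizes are 1GB per queue, 4GB per engine
 * and 16GB per core, so
 *
 *	offset = 1 * 16GB + 2 * 4GB + 2 * 1GB = 26GB
 *
 * nc_get_nq_from_mmap_offset() inverts this with successive div/mod by the
 * same sizes: 26GB / 16GB = 1 (rem 10GB), 10GB / 4GB = 2 (rem 2GB),
 * 2GB / 1GB = 2, recovering (1, 2, 2).
 */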
int nc_nq_init(struct neuron_device *nd, u8 nc_id, u8 eng_index, u32 nq_type, u32 size)
{
struct mem_chunk *mc, **mc_ptr;
u64 queue_pa;
void *apb_base;
int ret;
u8 nq_id;
u32 low, high;
if (nd == NULL || nc_id >= V1_NC_PER_DEVICE)
return -EINVAL;
nq_id = (nq_type * NQ_TYPE_PER_ENGINE) + eng_index;
if (nq_id >= MAX_NQ_SUPPORTED)
return -EINVAL;
mc_ptr = &nd->nq_mc[nc_id][nq_id];
if (*mc_ptr == NULL) {
ret = mc_alloc(&nd->mpset, mc_ptr, size, MEM_LOC_HOST, 0, 0, nc_id);
if (ret)
return ret;
}
mc = *mc_ptr;
apb_base = nd->npdev.bar0 + pu_get_relative_offset(nc_id);
queue_pa = mc->pa | PCIEX8_0_BASE;
low = (u32)(queue_pa & 0xffffffff);
high = (u32)(queue_pa >> 32U);
switch (nq_type) {
case NQ_TYPE_ERROR:
pu_write_error_notification_cfg_0(apb_base, low);
pu_write_error_notification_cfg_1(apb_base, high);
pu_write_error_notification_cfg_2(apb_base, size);
break;
case NQ_TYPE_EVENT:
pu_write_event_notification_cfg_0(apb_base, low);
pu_write_event_notification_cfg_1(apb_base, high);
pu_write_event_notification_cfg_2(apb_base, size);
break;
case NQ_TYPE_NOTIFY:
pu_write_expl_notification_cfg_0(apb_base, eng_index, 0, low);
pu_write_expl_notification_cfg_1(apb_base, eng_index, 0, high);
pu_write_expl_notification_cfg_2(apb_base, eng_index, 0, size);
break;
case NQ_TYPE_TRACE:
pu_write_impl_notification_cfg_0(apb_base, eng_index, 0, low);
pu_write_impl_notification_cfg_1(apb_base, eng_index, 0, high);
pu_write_impl_notification_cfg_2(apb_base, eng_index, 0, size);
break;
	default:
		return -EINVAL;
}
return 0;
}
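/*
 * Illustrative usage (hypothetical size): allocate and program a 4KB trace
 * notification queue on core 0, engine 0:
 *
 *	ret = nc_nq_init(nd, 0, 0, NQ_TYPE_TRACE, 4096);
 *
 * The backing memory is allocated from host memory on first use and reused
 * on subsequent calls for the same queue.
 */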
int nc_nq_destroy(struct neuron_device *nd, u8 nc_id, u8 eng_index, u32 nq_type)
{
u8 nq_id;
void *apb_base;
	if (nd == NULL || nc_id >= V1_NC_PER_DEVICE)
		return -EINVAL;
	nq_id = (nq_type * NQ_TYPE_PER_ENGINE) + eng_index;
	if (nq_id >= MAX_NQ_SUPPORTED)
		return -EINVAL;
if (nd->nq_mc[nc_id][nq_id] == NULL) {
return 0;
}
apb_base = nd->npdev.bar0 + pu_get_relative_offset(nc_id);
switch (nq_type) {
case NQ_TYPE_ERROR:
pu_write_error_notification_cfg_2(apb_base, 0);
pu_write_error_notification_cfg_0(apb_base, 0);
pu_write_error_notification_cfg_1(apb_base, 0);
break;
case NQ_TYPE_EVENT:
pu_write_event_notification_cfg_2(apb_base, 0);
pu_write_event_notification_cfg_0(apb_base, 0);
pu_write_event_notification_cfg_1(apb_base, 0);
break;
case NQ_TYPE_NOTIFY:
pu_write_expl_notification_cfg_2(apb_base, eng_index, 0, 0);
pu_write_expl_notification_cfg_0(apb_base, eng_index, 0, 0);
pu_write_expl_notification_cfg_1(apb_base, eng_index, 0, 0);
break;
case NQ_TYPE_TRACE:
pu_write_impl_notification_cfg_2(apb_base, eng_index, 0, 0);
pu_write_impl_notification_cfg_0(apb_base, eng_index, 0, 0);
pu_write_impl_notification_cfg_1(apb_base, eng_index, 0, 0);
break;
	default:
		return -EINVAL;
}
	// sleep 1 msec so that the hardware can drain any in-flight notification writes
msleep(1);
mc_free(&nd->nq_mc[nc_id][nq_id]);
return 0;
}
void nc_nq_destroy_all(struct neuron_device *nd)
{
u8 nc_id;
u8 eng_index;
u8 nq_type;
for (nc_id = 0; nc_id < V1_NC_PER_DEVICE; nc_id++) {
for (eng_index = 0; eng_index < MAX_NQ_ENGINE; eng_index++) {
for (nq_type = 0; nq_type < NQ_TYPE_PER_ENGINE; nq_type++) {
nc_nq_destroy(nd, nc_id, eng_index, nq_type);
}
}
}
}
int nc_nq_mmap(struct neuron_device *nd, u8 nc_id, u8 eng_index, u32 nq_type,
struct vm_area_struct *vma)
{
struct mem_chunk *mc;
u8 nq_id;
int ret;
if (nd == NULL || nc_id >= V1_NC_PER_DEVICE)
return -EINVAL;
nq_id = (nq_type * NQ_TYPE_PER_ENGINE) + eng_index;
if (nq_id >= MAX_NQ_SUPPORTED)
return -EINVAL;
mc = nd->nq_mc[nc_id][nq_id];
if (mc == NULL)
return -EINVAL;
#ifdef CONFIG_FAULT_INJECTION
if (should_fail(&neuron_fail_nc_mmap, 1))
return -ENOSPC;
#endif
ret = remap_pfn_range(vma, vma->vm_start, PHYS_PFN(mc->pa), mc->size, vma->vm_page_prot);
if (ret != 0)
return ret;
vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_DONTCOPY;
return 0;
}
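/*
 * Illustrative user-space view (hypothetical snippet): an application derives
 * the same per-queue offset layout as nc_get_nq_mmap_offset() and maps the
 * queue over the device node, e.g.
 *
 *	off_t off = nc_id * SZ_PER_NC + eng * SZ_PER_ENGINE + type * SZ_PER_NQ;
 *	void *nq = mmap(NULL, queue_size, PROT_READ, MAP_SHARED, fd, off);
 *
 * after which it can consume notification records directly from the mapping.
 */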