Linux Audio

Check our new training course

Loading...
Note: File does not exist in v3.1.
  1// SPDX-License-Identifier: GPL-2.0-only
  2/*
  3 * Intel MIC Platform Software Stack (MPSS)
  4 *
  5 * Copyright(c) 2015 Intel Corporation.
  6 *
  7 * Intel MIC Coprocessor State Management (COSM) Driver
  8 */
  9#include <linux/kthread.h>
 10#include <linux/sched/signal.h>
 11
 12#include "cosm_main.h"
 13
 14/*
 15 * The COSM driver uses SCIF to communicate between the management node and the
 16 * MIC cards. SCIF is used to (a) Send a shutdown command to the card (b)
 17 * receive a shutdown status back from the card upon completion of shutdown and
 18 * (c) receive periodic heartbeat messages from the card used to deduce if the
 19 * card has crashed.
 20 *
 21 * A COSM server consisting of a SCIF listening endpoint waits for incoming
 22 * connections from the card. Upon acceptance of the connection, a separate
 23 * work-item is scheduled to handle SCIF message processing for that card. The
 24 * life-time of this work-item is therefore the time from which the connection
 25 * from a card is accepted to the time at which the connection is closed. A new
 26 * work-item starts each time the card boots and is alive till the card (a)
 27 * shuts down (b) is reset (c) crashes (d) cosm_client driver on the card is
 28 * unloaded.
 29 *
 30 * The COSM interactions with SCIF during card shutdown, reset and crash are
 31 * as follows:
 32 *
 33 * Card shutdown
 34 * -------------
 35 * 1. COSM client on the card invokes orderly_poweroff() in response to SHUTDOWN
 36 *    message from the host.
 37 * 2. Card driver shutdown callback invokes scif_unregister_device(..) resulting
 38 *    in scif_remove(..) getting called on the card
 39 * 3. scif_remove -> scif_stop -> scif_handle_remove_node ->
 40 *    scif_peer_unregister_device -> device_unregister for the host peer device
 41 * 4. During device_unregister remove(..) method of cosm_client is invoked which
 42 *    closes the COSM SCIF endpoint on the card. This results in a SCIF_DISCNCT
 43 *    message being sent to host SCIF. SCIF_DISCNCT message processing on the
 44 *    host SCIF sets the host COSM SCIF endpoint state to DISCONNECTED and wakes
 45 *    up the host COSM thread blocked in scif_poll(..) resulting in
 46 *    scif_poll(..) returning EPOLLHUP.
 47 * 5. On the card, scif_peer_release_dev is next called which results in an
 48 *    SCIF_EXIT message being sent to the host and after receiving the
 49 *    SCIF_EXIT_ACK from the host the peer device teardown on the card is
 50 *    complete.
 51 * 6. As part of the SCIF_EXIT message processing on the host, host sends a
 52 *    SCIF_REMOVE_NODE to itself corresponding to the card being removed. This
 53 *    starts a similar SCIF peer device teardown sequence on the host
 54 *    corresponding to the card being shut down.
 55 *
 56 * Card reset
 57 * ----------
 58 * The case of interest here is when the card has not been previously shut down
 59 * since most of the steps below are skipped in that case:
 60 *
 61 * 1. cosm_stop(..) invokes hw_ops->stop(..) method of the base PCIe driver
 62 *    which unregisters the SCIF HW device resulting in scif_remove(..) being
 63 *    called on the host.
 64 * 2. scif_remove(..) calls scif_disconnect_node(..) which results in a
 65 *    SCIF_EXIT message being sent to the card.
 66 * 3. The card executes scif_stop() as part of SCIF_EXIT message
 67 *    processing. This results in the COSM endpoint on the card being closed and
 68 *    the SCIF host peer device on the card getting unregistered similar to
 69 *    steps 3, 4 and 5 for the card shutdown case above. scif_poll(..) on the
 70 *    host returns EPOLLHUP as a result.
 71 * 4. On the host, card peer device unregister and SCIF HW remove(..) also
 72 *    subsequently complete.
 73 *
 74 * Card crash
 75 * ----------
 76 * If a reset is issued after the card has crashed, there is no SCIF_DISCNCT
 77 * message from the card which would result in scif_poll(..) returning
 78 * EPOLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE
 79 * message to itself resulting in the card SCIF peer device being unregistered,
 80 * this results in a scif_peer_release_dev -> scif_cleanup_scifdev->
 81 * scif_invalidate_ep call sequence which sets the endpoint state to
 82 * DISCONNECTED and results in scif_poll(..) returning EPOLLHUP.
 83 */
 84
 85#define COSM_SCIF_BACKLOG 16 /* backlog passed to scif_listen() in cosm_scif_listen() */
/* Extra seconds added on top of COSM_HEARTBEAT_SEND_SEC (defined outside this file) */
 86#define COSM_HEARTBEAT_CHECK_DELTA_SEC 10
/*
 * Timeout handed to scif_poll() in cosm_scif_work(). When the poll times out
 * and the heartbeat watchdog is enabled the card is reported as crashed.
 * NOTE(review): COSM_HEARTBEAT_SEND_SEC is presumably the card's heartbeat
 * send period - confirm against its definition.
 */
 87#define COSM_HEARTBEAT_TIMEOUT_SEC \
 88		(COSM_HEARTBEAT_SEND_SEC + COSM_HEARTBEAT_CHECK_DELTA_SEC)
 89#define COSM_HEARTBEAT_TIMEOUT_MSEC (COSM_HEARTBEAT_TIMEOUT_SEC * MSEC_PER_SEC)
 90
/* Server kthread created in cosm_scif_init() and stopped in cosm_scif_exit() */
 91static struct task_struct *server_thread;
/* Listening endpoint opened by cosm_scif_listen(); reset to NULL once closed */
 92static scif_epd_t listen_epd;
 93
 94/* Publish MIC card's shutdown status to user space MIC daemon */
 95static void cosm_update_mic_status(struct cosm_device *cdev)
 96{
 97	if (cdev->shutdown_status_int != MIC_NOP) {
 98		cosm_set_shutdown_status(cdev, cdev->shutdown_status_int);
 99		cdev->shutdown_status_int = MIC_NOP;
100	}
101}
102
103/* Store MIC card's shutdown status internally when it is received */
104static void cosm_shutdown_status_int(struct cosm_device *cdev,
105				     enum mic_status shutdown_status)
106{
107	switch (shutdown_status) {
108	case MIC_HALTED:
109	case MIC_POWER_OFF:
110	case MIC_RESTART:
111	case MIC_CRASHED:
112		break;
113	default:
114		dev_err(&cdev->dev, "%s %d Unexpected shutdown_status %d\n",
115			__func__, __LINE__, shutdown_status);
116		return;
117	};
118	cdev->shutdown_status_int = shutdown_status;
119	cdev->heartbeat_watchdog_enable = false;
120
121	if (cdev->state != MIC_SHUTTING_DOWN)
122		cosm_set_state(cdev, MIC_SHUTTING_DOWN);
123}
124
125/* Non-blocking recv. Read and process all available messages */
126static void cosm_scif_recv(struct cosm_device *cdev)
127{
128	struct cosm_msg msg;
129	int rc;
130
131	while (1) {
132		rc = scif_recv(cdev->epd, &msg, sizeof(msg), 0);
133		if (!rc) {
134			break;
135		} else if (rc < 0) {
136			dev_dbg(&cdev->dev, "%s: %d rc %d\n",
137				__func__, __LINE__, rc);
138			break;
139		}
140		dev_dbg(&cdev->dev, "%s: %d rc %d id 0x%llx\n",
141			__func__, __LINE__, rc, msg.id);
142
143		switch (msg.id) {
144		case COSM_MSG_SHUTDOWN_STATUS:
145			cosm_shutdown_status_int(cdev, msg.shutdown_status);
146			break;
147		case COSM_MSG_HEARTBEAT:
148			/* Nothing to do, heartbeat only unblocks scif_poll */
149			break;
150		default:
151			dev_err(&cdev->dev, "%s: %d unknown msg.id %lld\n",
152				__func__, __LINE__, msg.id);
153			break;
154		}
155	}
156}
157
158/* Publish crashed status for this MIC card */
159static void cosm_set_crashed(struct cosm_device *cdev)
160{
161	dev_err(&cdev->dev, "node alive timeout\n");
162	cosm_shutdown_status_int(cdev, MIC_CRASHED);
163	cosm_update_mic_status(cdev);
164}
165
166/* Send host time to the MIC card to sync system time between host and MIC */
167static void cosm_send_time(struct cosm_device *cdev)
168{
169	struct cosm_msg msg = { .id = COSM_MSG_SYNC_TIME };
170	struct timespec64 ts;
171	int rc;
172
173	ktime_get_real_ts64(&ts);
174	msg.timespec.tv_sec = ts.tv_sec;
175	msg.timespec.tv_nsec = ts.tv_nsec;
176
177	rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK);
178	if (rc < 0)
179		dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n",
180			__func__, __LINE__, rc);
181}
182
183/*
184 * Close this cosm_device's endpoint after its peer endpoint on the card has
185 * been closed. In all cases except MIC card crash EPOLLHUP on the host is
186 * triggered by the client's endpoint being closed.
187 */
188static void cosm_scif_close(struct cosm_device *cdev)
189{
190	/*
191	 * Because SHUTDOWN_STATUS message is sent by the MIC cards in the
192	 * reboot notifier when shutdown is still not complete, we notify mpssd
193	 * to reset the card when SCIF endpoint is closed.
194	 */
195	cosm_update_mic_status(cdev);
196	scif_close(cdev->epd);
197	cdev->epd = NULL;
198	dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
199}
200
201/*
202 * Set card state to ONLINE when a new SCIF connection from a MIC card is
203 * received. Normally the state is BOOTING when the connection comes in, but can
204 * be ONLINE if cosm_client driver on the card was unloaded and then reloaded.
205 */
206static int cosm_set_online(struct cosm_device *cdev)
207{
208	int rc = 0;
209
210	if (MIC_BOOTING == cdev->state || MIC_ONLINE == cdev->state) {
211		cdev->heartbeat_watchdog_enable = cdev->sysfs_heartbeat_enable;
212		cdev->epd = cdev->newepd;
213		if (cdev->state == MIC_BOOTING)
214			cosm_set_state(cdev, MIC_ONLINE);
215		cosm_send_time(cdev);
216		dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
217	} else {
218		dev_warn(&cdev->dev, "%s %d not going online in state: %s\n",
219			 __func__, __LINE__, cosm_state_string[cdev->state]);
220		rc = -EINVAL;
221	}
222	/* Drop reference acquired by bus_find_device in the server thread */
223	put_device(&cdev->dev);
224	return rc;
225}
226
227/*
228 * Work function for handling work for a SCIF connection from a particular MIC
229 * card. It first sets the card state to ONLINE and then calls scif_poll to
230 * block on activity such as incoming messages on the SCIF endpoint. When the
231 * endpoint is closed, the work function exits, completing its life cycle, from
232 * MIC card boot to card shutdown/reset/crash.
233 */
234void cosm_scif_work(struct work_struct *work)
235{
236	struct cosm_device *cdev = container_of(work, struct cosm_device,
237						scif_work);
238	struct scif_pollepd pollepd;
239	int rc;
240
241	mutex_lock(&cdev->cosm_mutex);
242	if (cosm_set_online(cdev))
243		goto exit;
244
245	while (1) {
246		pollepd.epd = cdev->epd;
247		pollepd.events = EPOLLIN;
248
249		/* Drop the mutex before blocking in scif_poll(..) */
250		mutex_unlock(&cdev->cosm_mutex);
251		/* poll(..) with timeout on our endpoint */
252		rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_TIMEOUT_MSEC);
253		mutex_lock(&cdev->cosm_mutex);
254		if (rc < 0) {
255			dev_err(&cdev->dev, "%s %d scif_poll rc %d\n",
256				__func__, __LINE__, rc);
257			continue;
258		}
259
260		/* There is a message from the card */
261		if (pollepd.revents & EPOLLIN)
262			cosm_scif_recv(cdev);
263
264		/* The peer endpoint is closed or this endpoint disconnected */
265		if (pollepd.revents & EPOLLHUP) {
266			cosm_scif_close(cdev);
267			break;
268		}
269
270		/* Did we timeout from poll? */
271		if (!rc && cdev->heartbeat_watchdog_enable)
272			cosm_set_crashed(cdev);
273	}
274exit:
275	dev_dbg(&cdev->dev, "%s %d exiting\n", __func__, __LINE__);
276	mutex_unlock(&cdev->cosm_mutex);
277}
278
279/*
280 * COSM SCIF server thread function. Accepts incoming SCIF connections from MIC
281 * cards, finds the correct cosm_device to associate that connection with and
282 * schedules individual work items for each MIC card.
283 */
284static int cosm_scif_server(void *unused)
285{
286	struct cosm_device *cdev;
287	scif_epd_t newepd;
288	struct scif_port_id port_id;
289	int rc;
290
291	allow_signal(SIGKILL);
292
293	while (!kthread_should_stop()) {
294		rc = scif_accept(listen_epd, &port_id, &newepd,
295				 SCIF_ACCEPT_SYNC);
296		if (rc < 0) {
297			if (-ERESTARTSYS != rc)
298				pr_err("%s %d rc %d\n", __func__, __LINE__, rc);
299			continue;
300		}
301
302		/*
303		 * Associate the incoming connection with a particular
304		 * cosm_device, COSM device ID == SCIF node ID - 1
305		 */
306		cdev = cosm_find_cdev_by_id(port_id.node - 1);
307		if (!cdev)
308			continue;
309		cdev->newepd = newepd;
310		schedule_work(&cdev->scif_work);
311	}
312
313	pr_debug("%s %d Server thread stopped\n", __func__, __LINE__);
314	return 0;
315}
316
317static int cosm_scif_listen(void)
318{
319	int rc;
320
321	listen_epd = scif_open();
322	if (!listen_epd) {
323		pr_err("%s %d scif_open failed\n", __func__, __LINE__);
324		return -ENOMEM;
325	}
326
327	rc = scif_bind(listen_epd, SCIF_COSM_LISTEN_PORT);
328	if (rc < 0) {
329		pr_err("%s %d scif_bind failed rc %d\n",
330		       __func__, __LINE__, rc);
331		goto err;
332	}
333
334	rc = scif_listen(listen_epd, COSM_SCIF_BACKLOG);
335	if (rc < 0) {
336		pr_err("%s %d scif_listen rc %d\n", __func__, __LINE__, rc);
337		goto err;
338	}
339	pr_debug("%s %d listen_epd set up\n", __func__, __LINE__);
340	return 0;
341err:
342	scif_close(listen_epd);
343	listen_epd = NULL;
344	return rc;
345}
346
347static void cosm_scif_listen_exit(void)
348{
349	pr_debug("%s %d closing listen_epd\n", __func__, __LINE__);
350	if (listen_epd) {
351		scif_close(listen_epd);
352		listen_epd = NULL;
353	}
354}
355
356/*
357 * Create a listening SCIF endpoint and a server kthread which accepts incoming
358 * SCIF connections from MIC cards
359 */
360int cosm_scif_init(void)
361{
362	int rc = cosm_scif_listen();
363
364	if (rc) {
365		pr_err("%s %d cosm_scif_listen rc %d\n",
366		       __func__, __LINE__, rc);
367		goto err;
368	}
369
370	server_thread = kthread_run(cosm_scif_server, NULL, "cosm_server");
371	if (IS_ERR(server_thread)) {
372		rc = PTR_ERR(server_thread);
373		pr_err("%s %d kthread_run rc %d\n", __func__, __LINE__, rc);
374		goto listen_exit;
375	}
376	return 0;
377listen_exit:
378	cosm_scif_listen_exit();
379err:
380	return rc;
381}
382
383/* Stop the running server thread and close the listening SCIF endpoint */
384void cosm_scif_exit(void)
385{
386	int rc;
387
388	if (!IS_ERR_OR_NULL(server_thread)) {
389		rc = send_sig(SIGKILL, server_thread, 0);
390		if (rc) {
391			pr_err("%s %d send_sig rc %d\n",
392			       __func__, __LINE__, rc);
393			return;
394		}
395		kthread_stop(server_thread);
396	}
397
398	cosm_scif_listen_exit();
399}