Linux Audio

Check our new training course

Loading...
Note: File does not exist in v3.1.
  1/*
  2 * Intel MIC Platform Software Stack (MPSS)
  3 *
  4 * Copyright(c) 2015 Intel Corporation.
  5 *
  6 * This program is free software; you can redistribute it and/or modify
  7 * it under the terms of the GNU General Public License, version 2, as
  8 * published by the Free Software Foundation.
  9 *
 10 * This program is distributed in the hope that it will be useful, but
 11 * WITHOUT ANY WARRANTY; without even the implied warranty of
 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 13 * General Public License for more details.
 14 *
 15 * The full GNU General Public License is included in this distribution in
 16 * the file called "COPYING".
 17 *
 18 * Intel MIC Coprocessor State Management (COSM) Driver
 19 *
 20 */
 21#include <linux/kthread.h>
 22#include "cosm_main.h"
 23
 24/*
 25 * The COSM driver uses SCIF to communicate between the management node and the
 26 * MIC cards. SCIF is used to (a) Send a shutdown command to the card (b)
 27 * receive a shutdown status back from the card upon completion of shutdown and
 28 * (c) receive periodic heartbeat messages from the card used to deduce if the
 29 * card has crashed.
 30 *
 31 * A COSM server consisting of a SCIF listening endpoint waits for incoming
 32 * connections from the card. Upon acceptance of the connection, a separate
 33 * work-item is scheduled to handle SCIF message processing for that card. The
 34 * life-time of this work-item is therefore the time from which the connection
 35 * from a card is accepted to the time at which the connection is closed. A new
 36 * work-item starts each time the card boots and is alive till the card (a)
 37 * shuts down (b) is reset (c) crashes (d) cosm_client driver on the card is
 38 * unloaded.
 39 *
 40 * The COSM interactions with SCIF during card shutdown, reset and crash are
 41 * as follows:
 42 *
 43 * Card shutdown
 44 * -------------
 45 * 1. COSM client on the card invokes orderly_poweroff() in response to SHUTDOWN
 46 *    message from the host.
 47 * 2. Card driver shutdown callback invokes scif_unregister_device(..) resulting
 48 *    in scif_remove(..) getting called on the card
 49 * 3. scif_remove -> scif_stop -> scif_handle_remove_node ->
 50 *    scif_peer_unregister_device -> device_unregister for the host peer device
 51 * 4. During device_unregister remove(..) method of cosm_client is invoked which
 52 *    closes the COSM SCIF endpoint on the card. This results in a SCIF_DISCNCT
 53 *    message being sent to host SCIF. SCIF_DISCNCT message processing on the
 54 *    host SCIF sets the host COSM SCIF endpoint state to DISCONNECTED and wakes
 55 *    up the host COSM thread blocked in scif_poll(..) resulting in
 56 *    scif_poll(..)  returning POLLHUP.
 57 * 5. On the card, scif_peer_release_dev is next called which results in an
 58 *    SCIF_EXIT message being sent to the host and after receiving the
 59 *    SCIF_EXIT_ACK from the host the peer device teardown on the card is
 60 *    complete.
 61 * 6. As part of the SCIF_EXIT message processing on the host, host sends a
 62 *    SCIF_REMOVE_NODE to itself corresponding to the card being removed. This
 63 *    starts a similar SCIF peer device teardown sequence on the host
 64 *    corresponding to the card being shut down.
 65 *
 66 * Card reset
 67 * ----------
 68 * The case of interest here is when the card has not been previously shut down
 69 * since most of the steps below are skipped in that case:
 70 *
 71 * 1. cosm_stop(..) invokes hw_ops->stop(..) method of the base PCIe driver
 72 *    which unregisters the SCIF HW device resulting in scif_remove(..) being
 73 *    called on the host.
 74 * 2. scif_remove(..) calls scif_disconnect_node(..) which results in a
 75 *    SCIF_EXIT message being sent to the card.
 76 * 3. The card executes scif_stop() as part of SCIF_EXIT message
 77 *    processing. This results in the COSM endpoint on the card being closed and
 78 *    the SCIF host peer device on the card getting unregistered similar to
 79 *    steps 3, 4 and 5 for the card shutdown case above. scif_poll(..) on the
 80 *    host returns POLLHUP as a result.
 81 * 4. On the host, card peer device unregister and SCIF HW remove(..) also
 82 *    subsequently complete.
 83 *
 84 * Card crash
 85 * ----------
 86 * If a reset is issued after the card has crashed, there is no SCIF_DISCNCT
 87 * message from the card which would result in scif_poll(..) returning
 88 * POLLHUP. In this case when the host SCIF driver sends a SCIF_REMOVE_NODE
 89 * message to itself resulting in the card SCIF peer device being unregistered,
 90 * this results in a scif_peer_release_dev -> scif_cleanup_scifdev->
 91 * scif_invalidate_ep call sequence which sets the endpoint state to
 92 * DISCONNECTED and results in scif_poll(..) returning POLLHUP.
 93 */
 94
 95#define COSM_SCIF_BACKLOG 16
 96#define COSM_HEARTBEAT_CHECK_DELTA_SEC 10
 97#define COSM_HEARTBEAT_TIMEOUT_SEC \
 98		(COSM_HEARTBEAT_SEND_SEC + COSM_HEARTBEAT_CHECK_DELTA_SEC)
 99#define COSM_HEARTBEAT_TIMEOUT_MSEC (COSM_HEARTBEAT_TIMEOUT_SEC * MSEC_PER_SEC)
100
101static struct task_struct *server_thread;
102static scif_epd_t listen_epd;
103
104/* Publish MIC card's shutdown status to user space MIC daemon */
105static void cosm_update_mic_status(struct cosm_device *cdev)
106{
107	if (cdev->shutdown_status_int != MIC_NOP) {
108		cosm_set_shutdown_status(cdev, cdev->shutdown_status_int);
109		cdev->shutdown_status_int = MIC_NOP;
110	}
111}
112
113/* Store MIC card's shutdown status internally when it is received */
114static void cosm_shutdown_status_int(struct cosm_device *cdev,
115				     enum mic_status shutdown_status)
116{
117	switch (shutdown_status) {
118	case MIC_HALTED:
119	case MIC_POWER_OFF:
120	case MIC_RESTART:
121	case MIC_CRASHED:
122		break;
123	default:
124		dev_err(&cdev->dev, "%s %d Unexpected shutdown_status %d\n",
125			__func__, __LINE__, shutdown_status);
126		return;
127	};
128	cdev->shutdown_status_int = shutdown_status;
129	cdev->heartbeat_watchdog_enable = false;
130
131	if (cdev->state != MIC_SHUTTING_DOWN)
132		cosm_set_state(cdev, MIC_SHUTTING_DOWN);
133}
134
135/* Non-blocking recv. Read and process all available messages */
136static void cosm_scif_recv(struct cosm_device *cdev)
137{
138	struct cosm_msg msg;
139	int rc;
140
141	while (1) {
142		rc = scif_recv(cdev->epd, &msg, sizeof(msg), 0);
143		if (!rc) {
144			break;
145		} else if (rc < 0) {
146			dev_dbg(&cdev->dev, "%s: %d rc %d\n",
147				__func__, __LINE__, rc);
148			break;
149		}
150		dev_dbg(&cdev->dev, "%s: %d rc %d id 0x%llx\n",
151			__func__, __LINE__, rc, msg.id);
152
153		switch (msg.id) {
154		case COSM_MSG_SHUTDOWN_STATUS:
155			cosm_shutdown_status_int(cdev, msg.shutdown_status);
156			break;
157		case COSM_MSG_HEARTBEAT:
158			/* Nothing to do, heartbeat only unblocks scif_poll */
159			break;
160		default:
161			dev_err(&cdev->dev, "%s: %d unknown msg.id %lld\n",
162				__func__, __LINE__, msg.id);
163			break;
164		}
165	}
166}
167
168/* Publish crashed status for this MIC card */
169static void cosm_set_crashed(struct cosm_device *cdev)
170{
171	dev_err(&cdev->dev, "node alive timeout\n");
172	cosm_shutdown_status_int(cdev, MIC_CRASHED);
173	cosm_update_mic_status(cdev);
174}
175
176/* Send host time to the MIC card to sync system time between host and MIC */
177static void cosm_send_time(struct cosm_device *cdev)
178{
179	struct cosm_msg msg = { .id = COSM_MSG_SYNC_TIME };
180	int rc;
181
182	getnstimeofday64(&msg.timespec);
183	rc = scif_send(cdev->epd, &msg, sizeof(msg), SCIF_SEND_BLOCK);
184	if (rc < 0)
185		dev_err(&cdev->dev, "%s %d scif_send failed rc %d\n",
186			__func__, __LINE__, rc);
187}
188
189/*
190 * Close this cosm_device's endpoint after its peer endpoint on the card has
191 * been closed. In all cases except MIC card crash POLLHUP on the host is
192 * triggered by the client's endpoint being closed.
193 */
194static void cosm_scif_close(struct cosm_device *cdev)
195{
196	/*
197	 * Because SHUTDOWN_STATUS message is sent by the MIC cards in the
198	 * reboot notifier when shutdown is still not complete, we notify mpssd
199	 * to reset the card when SCIF endpoint is closed.
200	 */
201	cosm_update_mic_status(cdev);
202	scif_close(cdev->epd);
203	cdev->epd = NULL;
204	dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
205}
206
207/*
208 * Set card state to ONLINE when a new SCIF connection from a MIC card is
209 * received. Normally the state is BOOTING when the connection comes in, but can
210 * be ONLINE if cosm_client driver on the card was unloaded and then reloaded.
211 */
212static int cosm_set_online(struct cosm_device *cdev)
213{
214	int rc = 0;
215
216	if (MIC_BOOTING == cdev->state || MIC_ONLINE == cdev->state) {
217		cdev->heartbeat_watchdog_enable = cdev->sysfs_heartbeat_enable;
218		cdev->epd = cdev->newepd;
219		if (cdev->state == MIC_BOOTING)
220			cosm_set_state(cdev, MIC_ONLINE);
221		cosm_send_time(cdev);
222		dev_dbg(&cdev->dev, "%s %d\n", __func__, __LINE__);
223	} else {
224		dev_warn(&cdev->dev, "%s %d not going online in state: %s\n",
225			 __func__, __LINE__, cosm_state_string[cdev->state]);
226		rc = -EINVAL;
227	}
228	/* Drop reference acquired by bus_find_device in the server thread */
229	put_device(&cdev->dev);
230	return rc;
231}
232
233/*
234 * Work function for handling work for a SCIF connection from a particular MIC
235 * card. It first sets the card state to ONLINE and then calls scif_poll to
236 * block on activity such as incoming messages on the SCIF endpoint. When the
237 * endpoint is closed, the work function exits, completing its life cycle, from
238 * MIC card boot to card shutdown/reset/crash.
239 */
240void cosm_scif_work(struct work_struct *work)
241{
242	struct cosm_device *cdev = container_of(work, struct cosm_device,
243						scif_work);
244	struct scif_pollepd pollepd;
245	int rc;
246
247	mutex_lock(&cdev->cosm_mutex);
248	if (cosm_set_online(cdev))
249		goto exit;
250
251	while (1) {
252		pollepd.epd = cdev->epd;
253		pollepd.events = POLLIN;
254
255		/* Drop the mutex before blocking in scif_poll(..) */
256		mutex_unlock(&cdev->cosm_mutex);
257		/* poll(..) with timeout on our endpoint */
258		rc = scif_poll(&pollepd, 1, COSM_HEARTBEAT_TIMEOUT_MSEC);
259		mutex_lock(&cdev->cosm_mutex);
260		if (rc < 0) {
261			dev_err(&cdev->dev, "%s %d scif_poll rc %d\n",
262				__func__, __LINE__, rc);
263			continue;
264		}
265
266		/* There is a message from the card */
267		if (pollepd.revents & POLLIN)
268			cosm_scif_recv(cdev);
269
270		/* The peer endpoint is closed or this endpoint disconnected */
271		if (pollepd.revents & POLLHUP) {
272			cosm_scif_close(cdev);
273			break;
274		}
275
276		/* Did we timeout from poll? */
277		if (!rc && cdev->heartbeat_watchdog_enable)
278			cosm_set_crashed(cdev);
279	}
280exit:
281	dev_dbg(&cdev->dev, "%s %d exiting\n", __func__, __LINE__);
282	mutex_unlock(&cdev->cosm_mutex);
283}
284
285/*
286 * COSM SCIF server thread function. Accepts incoming SCIF connections from MIC
287 * cards, finds the correct cosm_device to associate that connection with and
288 * schedules individual work items for each MIC card.
289 */
290static int cosm_scif_server(void *unused)
291{
292	struct cosm_device *cdev;
293	scif_epd_t newepd;
294	struct scif_port_id port_id;
295	int rc;
296
297	allow_signal(SIGKILL);
298
299	while (!kthread_should_stop()) {
300		rc = scif_accept(listen_epd, &port_id, &newepd,
301				 SCIF_ACCEPT_SYNC);
302		if (rc < 0) {
303			if (-ERESTARTSYS != rc)
304				pr_err("%s %d rc %d\n", __func__, __LINE__, rc);
305			continue;
306		}
307
308		/*
309		 * Associate the incoming connection with a particular
310		 * cosm_device, COSM device ID == SCIF node ID - 1
311		 */
312		cdev = cosm_find_cdev_by_id(port_id.node - 1);
313		if (!cdev)
314			continue;
315		cdev->newepd = newepd;
316		schedule_work(&cdev->scif_work);
317	}
318
319	pr_debug("%s %d Server thread stopped\n", __func__, __LINE__);
320	return 0;
321}
322
323static int cosm_scif_listen(void)
324{
325	int rc;
326
327	listen_epd = scif_open();
328	if (!listen_epd) {
329		pr_err("%s %d scif_open failed\n", __func__, __LINE__);
330		return -ENOMEM;
331	}
332
333	rc = scif_bind(listen_epd, SCIF_COSM_LISTEN_PORT);
334	if (rc < 0) {
335		pr_err("%s %d scif_bind failed rc %d\n",
336		       __func__, __LINE__, rc);
337		goto err;
338	}
339
340	rc = scif_listen(listen_epd, COSM_SCIF_BACKLOG);
341	if (rc < 0) {
342		pr_err("%s %d scif_listen rc %d\n", __func__, __LINE__, rc);
343		goto err;
344	}
345	pr_debug("%s %d listen_epd set up\n", __func__, __LINE__);
346	return 0;
347err:
348	scif_close(listen_epd);
349	listen_epd = NULL;
350	return rc;
351}
352
353static void cosm_scif_listen_exit(void)
354{
355	pr_debug("%s %d closing listen_epd\n", __func__, __LINE__);
356	if (listen_epd) {
357		scif_close(listen_epd);
358		listen_epd = NULL;
359	}
360}
361
362/*
363 * Create a listening SCIF endpoint and a server kthread which accepts incoming
364 * SCIF connections from MIC cards
365 */
366int cosm_scif_init(void)
367{
368	int rc = cosm_scif_listen();
369
370	if (rc) {
371		pr_err("%s %d cosm_scif_listen rc %d\n",
372		       __func__, __LINE__, rc);
373		goto err;
374	}
375
376	server_thread = kthread_run(cosm_scif_server, NULL, "cosm_server");
377	if (IS_ERR(server_thread)) {
378		rc = PTR_ERR(server_thread);
379		pr_err("%s %d kthread_run rc %d\n", __func__, __LINE__, rc);
380		goto listen_exit;
381	}
382	return 0;
383listen_exit:
384	cosm_scif_listen_exit();
385err:
386	return rc;
387}
388
389/* Stop the running server thread and close the listening SCIF endpoint */
390void cosm_scif_exit(void)
391{
392	int rc;
393
394	if (!IS_ERR_OR_NULL(server_thread)) {
395		rc = send_sig(SIGKILL, server_thread, 0);
396		if (rc) {
397			pr_err("%s %d send_sig rc %d\n",
398			       __func__, __LINE__, rc);
399			return;
400		}
401		kthread_stop(server_thread);
402	}
403
404	cosm_scif_listen_exit();
405}