linux_sysfs.c revision 49f872b5
1/*
2 * (C) Copyright IBM Corporation 2006
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22 * DEALINGS IN THE SOFTWARE.
23 */
24
25/**
26 * \file linux_sysfs.c
27 * Access PCI subsystem using Linux's sysfs interface.  This interface is
28 * available starting somewhere in the late 2.5.x kernel phase, and is the
29 * preferred method on all 2.6.x kernels.
30 *
31 * \author Ian Romanick <idr@us.ibm.com>
32 */
33
34#define _GNU_SOURCE
35
36#include <stdlib.h>
37#include <string.h>
38#include <stdio.h>
39#include <unistd.h>
40#include <sys/types.h>
41#include <sys/stat.h>
42#include <fcntl.h>
43#include <sys/mman.h>
44#include <dirent.h>
45#include <errno.h>
46
47#include "config.h"
48
49#ifdef HAVE_MTRR
50#include <asm/mtrr.h>
51#include <sys/ioctl.h>
52#endif
53
54#include "pciaccess.h"
55#include "pciaccess_private.h"
56#include "linux_devmem.h"
57
58static void pci_device_linux_sysfs_enable(struct pci_device *dev);
59
60static int pci_device_linux_sysfs_read_rom( struct pci_device * dev,
61    void * buffer );
62
63static int pci_device_linux_sysfs_probe( struct pci_device * dev );
64
65static int pci_device_linux_sysfs_map_range(struct pci_device *dev,
66    struct pci_device_mapping *map);
67
68static int pci_device_linux_sysfs_unmap_range(struct pci_device *dev,
69    struct pci_device_mapping *map);
70
71static int pci_device_linux_sysfs_read( struct pci_device * dev, void * data,
72    pciaddr_t offset, pciaddr_t size, pciaddr_t * bytes_read );
73
74static int pci_device_linux_sysfs_write( struct pci_device * dev,
75    const void * data, pciaddr_t offset, pciaddr_t size,
76    pciaddr_t * bytes_written );
77
78static int pci_device_linux_sysfs_boot_vga( struct pci_device * dev );
79static int pci_device_linux_sysfs_has_kernel_driver(struct pci_device *dev);
80
81static const struct pci_system_methods linux_sysfs_methods = {
82    .destroy = NULL,
83    .destroy_device = NULL,
84    .read_rom = pci_device_linux_sysfs_read_rom,
85    .probe = pci_device_linux_sysfs_probe,
86    .map_range = pci_device_linux_sysfs_map_range,
87    .unmap_range = pci_device_linux_sysfs_unmap_range,
88
89    .read = pci_device_linux_sysfs_read,
90    .write = pci_device_linux_sysfs_write,
91
92    .fill_capabilities = pci_fill_capabilities_generic,
93    .enable = pci_device_linux_sysfs_enable,
94    .boot_vga = pci_device_linux_sysfs_boot_vga,
95    .has_kernel_driver = pci_device_linux_sysfs_has_kernel_driver,
96};
97
98#define SYS_BUS_PCI "/sys/bus/pci/devices"
99
100
101static int populate_entries(struct pci_system * pci_sys);
102
103
104/**
105 * Attempt to access PCI subsystem using Linux's sysfs interface.
106 */
107_pci_hidden int
108pci_system_linux_sysfs_create( void )
109{
110    int err = 0;
111    struct stat st;
112
113
114    /* If the directory "/sys/bus/pci/devices" exists, then the PCI subsystem
115     * can be accessed using this interface.
116     */
117
118    if ( stat( SYS_BUS_PCI, & st ) == 0 ) {
119	pci_sys = calloc( 1, sizeof( struct pci_system ) );
120	if ( pci_sys != NULL ) {
121	    pci_sys->methods = & linux_sysfs_methods;
122#ifdef HAVE_MTRR
123	    pci_sys->mtrr_fd = open("/proc/mtrr", O_WRONLY);
124#endif
125	    err = populate_entries(pci_sys);
126	}
127	else {
128	    err = ENOMEM;
129	}
130    }
131    else {
132	err = errno;
133    }
134
135    return err;
136}
137
138
/**
 * \c scandir selection callback that drops the "." and ".." entries.
 *
 * \param d  Directory entry being processed by \c scandir.
 *
 * \return
 * Zero if the entry name is exactly "." or "..", one for any other name.
 *
 * \sa scandir, populate_entries
 */
static int
scan_sys_pci_filter( const struct dirent * d )
{
    const char * const entry = d->d_name;

    if ( strcmp( entry, "." ) == 0 ) {
        return 0;
    }

    if ( strcmp( entry, ".." ) == 0 ) {
        return 0;
    }

    return 1;
}
155
156
157int
158populate_entries( struct pci_system * p )
159{
160    struct dirent ** devices;
161    int n;
162    int i;
163    int err = 0;
164
165
166    n = scandir( SYS_BUS_PCI, & devices, scan_sys_pci_filter, alphasort );
167    if ( n > 0 ) {
168	p->num_devices = n;
169	p->devices = calloc( n, sizeof( struct pci_device_private ) );
170
171	if (p->devices != NULL) {
172	    for (i = 0 ; i < n ; i++) {
173		uint8_t config[48];
174		pciaddr_t bytes;
175		unsigned dom, bus, dev, func;
176		struct pci_device_private *device =
177			(struct pci_device_private *) &p->devices[i];
178
179
180		sscanf(devices[i]->d_name, "%04x:%02x:%02x.%1u",
181		       & dom, & bus, & dev, & func);
182
183		device->base.domain = dom;
184		device->base.bus = bus;
185		device->base.dev = dev;
186		device->base.func = func;
187
188
189		err = pci_device_linux_sysfs_read(& device->base, config, 0,
190						  48, & bytes);
191		if ((bytes == 48) && !err) {
192		    device->base.vendor_id = (uint16_t)config[0]
193			+ ((uint16_t)config[1] << 8);
194		    device->base.device_id = (uint16_t)config[2]
195			+ ((uint16_t)config[3] << 8);
196		    device->base.device_class = (uint32_t)config[9]
197			+ ((uint32_t)config[10] << 8)
198			+ ((uint32_t)config[11] << 16);
199		    device->base.revision = config[8];
200		    device->base.subvendor_id = (uint16_t)config[44]
201			+ ((uint16_t)config[45] << 8);
202		    device->base.subdevice_id = (uint16_t)config[46]
203			+ ((uint16_t)config[47] << 8);
204		}
205
206		if (err) {
207		    break;
208		}
209	    }
210	}
211	else {
212	    err = ENOMEM;
213	}
214    }
215
216    for (i = 0; i < n; i++)
217	free(devices[i]);
218    free(devices);
219
220    if (err) {
221	free(p->devices);
222	p->devices = NULL;
223    }
224
225    return err;
226}
227
228
229static int
230pci_device_linux_sysfs_probe( struct pci_device * dev )
231{
232    char     name[256];
233    uint8_t  config[256];
234    char     resource[512];
235    int fd;
236    pciaddr_t bytes;
237    unsigned i;
238    int err;
239
240
241    err = pci_device_linux_sysfs_read( dev, config, 0, 256, & bytes );
242    if ( bytes >= 64 ) {
243	struct pci_device_private *priv = (struct pci_device_private *) dev;
244
245	dev->irq = config[60];
246	priv->header_type = config[14];
247
248
249	/* The PCI config registers can be used to obtain information
250	 * about the memory and I/O regions for the device.  However,
251	 * doing so requires some tricky parsing (to correctly handle
252	 * 64-bit memory regions) and requires writing to the config
253	 * registers.  Since we'd like to avoid having to deal with the
254	 * parsing issues and non-root users can write to PCI config
255	 * registers, we use a different file in the device's sysfs
256	 * directory called "resource".
257	 *
258	 * The resource file contains all of the needed information in
259	 * a format that is consistent across all platforms.  Each BAR
260	 * and the expansion ROM have a single line of data containing
261	 * 3, 64-bit hex values:  the first address in the region,
262	 * the last address in the region, and the region's flags.
263	 */
264	snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/resource",
265		  SYS_BUS_PCI,
266		  dev->domain,
267		  dev->bus,
268		  dev->dev,
269		  dev->func );
270	fd = open( name, O_RDONLY );
271	if ( fd != -1 ) {
272	    char * next;
273	    pciaddr_t  low_addr;
274	    pciaddr_t  high_addr;
275	    pciaddr_t  flags;
276
277
278	    bytes = read( fd, resource, 512 );
279	    resource[511] = '\0';
280
281	    close( fd );
282
283	    next = resource;
284	    for ( i = 0 ; i < 6 ; i++ ) {
285
286		dev->regions[i].base_addr = strtoull( next, & next, 16 );
287		high_addr = strtoull( next, & next, 16 );
288		flags = strtoull( next, & next, 16 );
289
290		if ( dev->regions[i].base_addr != 0 ) {
291		    dev->regions[i].size = (high_addr
292					    - dev->regions[i].base_addr) + 1;
293
294		    dev->regions[i].is_IO = (flags & 0x01);
295		    dev->regions[i].is_64 = (flags & 0x04);
296		    dev->regions[i].is_prefetchable = (flags & 0x08);
297		}
298	    }
299
300	    low_addr = strtoull( next, & next, 16 );
301	    high_addr = strtoull( next, & next, 16 );
302	    flags = strtoull( next, & next, 16 );
303	    if ( low_addr != 0 ) {
304		priv->rom_base = low_addr;
305		dev->rom_size = (high_addr - low_addr) + 1;
306	    }
307	}
308    }
309
310    return err;
311}
312
313
314static int
315pci_device_linux_sysfs_read_rom( struct pci_device * dev, void * buffer )
316{
317    char name[256];
318    int fd;
319    struct stat  st;
320    int err = 0;
321    size_t rom_size;
322    size_t total_bytes;
323
324
325    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/rom",
326	      SYS_BUS_PCI,
327	      dev->domain,
328	      dev->bus,
329	      dev->dev,
330	      dev->func );
331
332    fd = open( name, O_RDWR );
333    if ( fd == -1 ) {
334#ifdef LINUX_ROM
335	/* If reading the ROM using sysfs fails, fall back to the old
336	 * /dev/mem based interface.
337	 * disable this for newer kernels using configure
338	 */
339	return pci_device_linux_devmem_read_rom(dev, buffer);
340#else
341	return errno;
342#endif
343    }
344
345
346    if ( fstat( fd, & st ) == -1 ) {
347	close( fd );
348	return errno;
349    }
350
351    rom_size = st.st_size;
352    if ( rom_size == 0 )
353	rom_size = 0x10000;
354
355    /* This is a quirky thing on Linux.  Even though the ROM and the file
356     * for the ROM in sysfs are read-only, the string "1" must be written to
357     * the file to enable the ROM.  After the data has been read, "0" must be
358     * written to the file to disable the ROM.
359     */
360    write( fd, "1", 1 );
361    lseek( fd, 0, SEEK_SET );
362
363    for ( total_bytes = 0 ; total_bytes < rom_size ; /* empty */ ) {
364	const int bytes = read( fd, (char *) buffer + total_bytes,
365				rom_size - total_bytes );
366	if ( bytes == -1 ) {
367	    err = errno;
368	    break;
369	}
370	else if ( bytes == 0 ) {
371	    break;
372	}
373
374	total_bytes += bytes;
375    }
376
377
378    lseek( fd, 0, SEEK_SET );
379    write( fd, "0", 1 );
380
381    close( fd );
382    return err;
383}
384
385
386static int
387pci_device_linux_sysfs_read( struct pci_device * dev, void * data,
388			     pciaddr_t offset, pciaddr_t size,
389			     pciaddr_t * bytes_read )
390{
391    char name[256];
392    pciaddr_t temp_size = size;
393    int err = 0;
394    int fd;
395    char *data_bytes = data;
396
397    if ( bytes_read != NULL ) {
398	*bytes_read = 0;
399    }
400
401    /* Each device has a directory under sysfs.  Within that directory there
402     * is a file named "config".  This file used to access the PCI config
403     * space.  It is used here to obtain most of the information about the
404     * device.
405     */
406    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/config",
407	      SYS_BUS_PCI,
408	      dev->domain,
409	      dev->bus,
410	      dev->dev,
411	      dev->func );
412
413    fd = open( name, O_RDONLY );
414    if ( fd == -1 ) {
415	return errno;
416    }
417
418
419    while ( temp_size > 0 ) {
420	const ssize_t bytes = pread64( fd, data_bytes, temp_size, offset );
421
422	/* If zero bytes were read, then we assume it's the end of the
423	 * config file.
424	 */
425	if ( bytes <= 0 ) {
426	    err = errno;
427	    break;
428	}
429
430	temp_size -= bytes;
431	offset += bytes;
432	data_bytes += bytes;
433    }
434
435    if ( bytes_read != NULL ) {
436	*bytes_read = size - temp_size;
437    }
438
439    close( fd );
440    return err;
441}
442
443
444static int
445pci_device_linux_sysfs_write( struct pci_device * dev, const void * data,
446			     pciaddr_t offset, pciaddr_t size,
447			     pciaddr_t * bytes_written )
448{
449    char name[256];
450    pciaddr_t temp_size = size;
451    int err = 0;
452    int fd;
453    const char *data_bytes = data;
454
455    if ( bytes_written != NULL ) {
456	*bytes_written = 0;
457    }
458
459    /* Each device has a directory under sysfs.  Within that directory there
460     * is a file named "config".  This file used to access the PCI config
461     * space.  It is used here to obtain most of the information about the
462     * device.
463     */
464    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/config",
465	      SYS_BUS_PCI,
466	      dev->domain,
467	      dev->bus,
468	      dev->dev,
469	      dev->func );
470
471    fd = open( name, O_WRONLY );
472    if ( fd == -1 ) {
473	return errno;
474    }
475
476
477    while ( temp_size > 0 ) {
478	const ssize_t bytes = pwrite64( fd, data_bytes, temp_size, offset );
479
480	/* If zero bytes were written, then we assume it's the end of the
481	 * config file.
482	 */
483	if ( bytes <= 0 ) {
484	    err = errno;
485	    break;
486	}
487
488	temp_size -= bytes;
489	offset += bytes;
490	data_bytes += bytes;
491    }
492
493    if ( bytes_written != NULL ) {
494	*bytes_written = size - temp_size;
495    }
496
497    close( fd );
498    return err;
499}
500
501static int
502pci_device_linux_sysfs_map_range_wc(struct pci_device *dev,
503				    struct pci_device_mapping *map)
504{
505    char name[256];
506    int fd;
507    const int prot = ((map->flags & PCI_DEV_MAP_FLAG_WRITABLE) != 0)
508        ? (PROT_READ | PROT_WRITE) : PROT_READ;
509    const int open_flags = ((map->flags & PCI_DEV_MAP_FLAG_WRITABLE) != 0)
510        ? O_RDWR : O_RDONLY;
511    const off_t offset = map->base - dev->regions[map->region].base_addr;
512
513    snprintf(name, 255, "%s/%04x:%02x:%02x.%1u/resource%u_wc",
514	     SYS_BUS_PCI,
515	     dev->domain,
516	     dev->bus,
517	     dev->dev,
518	     dev->func,
519	     map->region);
520    fd = open(name, open_flags);
521    if (fd == -1)
522	    return errno;
523
524    map->memory = mmap(NULL, map->size, prot, MAP_SHARED, fd, offset);
525    if (map->memory == MAP_FAILED) {
526        map->memory = NULL;
527	close(fd);
528	return errno;
529    }
530
531    close(fd);
532
533    return 0;
534}
535
536/**
537 * Map a memory region for a device using the Linux sysfs interface.
538 *
539 * \param dev   Device whose memory region is to be mapped.
540 * \param map   Parameters of the mapping that is to be created.
541 *
542 * \return
543 * Zero on success or an \c errno value on failure.
544 *
545 * \sa pci_device_map_rrange, pci_device_linux_sysfs_unmap_range
546 *
547 * \todo
548 * Some older 2.6.x kernels don't implement the resourceN files.  On those
549 * systems /dev/mem must be used.  On these systems it is also possible that
550 * \c mmap64 may need to be used.
551 */
552static int
553pci_device_linux_sysfs_map_range(struct pci_device *dev,
554                                 struct pci_device_mapping *map)
555{
556    char name[256];
557    int fd;
558    int err = 0;
559    const int prot = ((map->flags & PCI_DEV_MAP_FLAG_WRITABLE) != 0)
560        ? (PROT_READ | PROT_WRITE) : PROT_READ;
561    const int open_flags = ((map->flags & PCI_DEV_MAP_FLAG_WRITABLE) != 0)
562        ? O_RDWR : O_RDONLY;
563    const off_t offset = map->base - dev->regions[map->region].base_addr;
564#ifdef HAVE_MTRR
565    struct mtrr_sentry sentry = {
566	.base = map->base,
567        .size = map->size,
568	.type = MTRR_TYPE_UNCACHABLE
569    };
570#endif
571
572    /* For WC mappings, try sysfs resourceN_wc file first */
573    if ((map->flags & PCI_DEV_MAP_FLAG_WRITE_COMBINE) &&
574	!pci_device_linux_sysfs_map_range_wc(dev, map))
575	    return 0;
576
577    snprintf(name, 255, "%s/%04x:%02x:%02x.%1u/resource%u",
578             SYS_BUS_PCI,
579             dev->domain,
580             dev->bus,
581             dev->dev,
582             dev->func,
583             map->region);
584
585    fd = open(name, open_flags);
586    if (fd == -1) {
587        return errno;
588    }
589
590
591    map->memory = mmap(NULL, map->size, prot, MAP_SHARED, fd, offset);
592    if (map->memory == MAP_FAILED) {
593        map->memory = NULL;
594	close(fd);
595	return errno;
596    }
597
598#ifdef HAVE_MTRR
599    if ((map->flags & PCI_DEV_MAP_FLAG_CACHABLE) != 0) {
600        sentry.type = MTRR_TYPE_WRBACK;
601    } else if ((map->flags & PCI_DEV_MAP_FLAG_WRITE_COMBINE) != 0) {
602        sentry.type = MTRR_TYPE_WRCOMB;
603    }
604
605    if (pci_sys->mtrr_fd != -1 && sentry.type != MTRR_TYPE_UNCACHABLE) {
606	if (ioctl(pci_sys->mtrr_fd, MTRRIOC_ADD_ENTRY, &sentry) < 0) {
607	    /* FIXME: Should we report an error in this case?
608	     */
609	    fprintf(stderr, "error setting MTRR "
610		    "(base = 0x%08lx, size = 0x%08x, type = %u) %s (%d)\n",
611		    sentry.base, sentry.size, sentry.type,
612		    strerror(errno), errno);
613/*            err = errno;*/
614	}
615	/* KLUDGE ALERT -- rewrite the PTEs to turn off the CD and WT bits */
616	mprotect (map->memory, map->size, PROT_NONE);
617	err = mprotect (map->memory, map->size, PROT_READ|PROT_WRITE);
618
619	if (err != 0) {
620	    fprintf(stderr, "mprotect(PROT_READ | PROT_WRITE) failed: %s\n",
621		    strerror(errno));
622	    fprintf(stderr, "remapping without mprotect performance kludge.\n");
623
624	    munmap(map->memory, map->size);
625	    map->memory = mmap(NULL, map->size, prot, MAP_SHARED, fd, offset);
626	    if (map->memory == MAP_FAILED) {
627		map->memory = NULL;
628		close(fd);
629		return errno;
630	    }
631	}
632    }
633#endif
634
635    close(fd);
636
637    return 0;
638}
639
/**
 * Unmap a memory region for a device using the Linux sysfs interface.
 *
 * The mapping itself is released by \c pci_device_generic_unmap_range.
 * When built with \c HAVE_MTRR and an MTRR descriptor is available, the
 * MTRR entry matching a cachable or write-combining mapping is deleted
 * afterwards; a failure to delete it is only logged.
 *
 * \param dev   Device whose memory region is to be unmapped.
 * \param map   Parameters of the mapping that is to be destroyed.
 *
 * \return
 * Zero on success or the error reported by
 * \c pci_device_generic_unmap_range on failure.
 *
 * \sa pci_device_linux_sysfs_map_range
 *
 * \todo
 * Some older 2.6.x kernels don't implement the resourceN files.  On those
 * systems /dev/mem must be used.  On these systems it is also possible that
 * \c mmap64 may need to be used.
 */
static int
pci_device_linux_sysfs_unmap_range(struct pci_device *dev,
                                   struct pci_device_mapping *map)
{
#ifdef HAVE_MTRR
    /* Captured before the mapping is released. */
    struct mtrr_sentry entry = {
        .base = map->base,
        .size = map->size,
        .type = MTRR_TYPE_UNCACHABLE
    };
#endif
    const int status = pci_device_generic_unmap_range (dev, map);

    if (status != 0) {
        return status;
    }

#ifdef HAVE_MTRR
    if ((map->flags & PCI_DEV_MAP_FLAG_CACHABLE) != 0) {
        entry.type = MTRR_TYPE_WRBACK;
    } else if ((map->flags & PCI_DEV_MAP_FLAG_WRITE_COMBINE) != 0) {
        entry.type = MTRR_TYPE_WRCOMB;
    }

    /* Nothing to delete without an MTRR descriptor or for plain
     * uncachable mappings.
     */
    if (pci_sys->mtrr_fd == -1 || entry.type == MTRR_TYPE_UNCACHABLE) {
        return status;
    }

    if (ioctl(pci_sys->mtrr_fd, MTRRIOC_DEL_ENTRY, &entry) < 0) {
        /* FIXME: Should we report an error in this case?
         */
        fprintf(stderr, "error setting MTRR "
                "(base = 0x%08lx, size = 0x%08x, type = %u) %s (%d)\n",
                entry.base, entry.size, entry.type,
                strerror(errno), errno);
    }
#endif

    return status;
}
695
696static void pci_device_linux_sysfs_enable(struct pci_device *dev)
697{
698    char name[256];
699    int fd;
700
701    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/enable",
702	      SYS_BUS_PCI,
703	      dev->domain,
704	      dev->bus,
705	      dev->dev,
706	      dev->func );
707
708    fd = open( name, O_RDWR );
709    if (fd == -1)
710       return;
711
712    write( fd, "1", 1 );
713    close(fd);
714}
715
716static int pci_device_linux_sysfs_boot_vga(struct pci_device *dev)
717{
718    char name[256];
719    char reply[3];
720    int fd, bytes_read;
721    int ret = 0;
722
723    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/boot_vga",
724	      SYS_BUS_PCI,
725	      dev->domain,
726	      dev->bus,
727	      dev->dev,
728	      dev->func );
729
730    fd = open( name, O_RDONLY );
731    if (fd == -1)
732       return 0;
733
734    bytes_read = read(fd, reply, 1);
735    if (bytes_read != 1)
736	goto out;
737    if (reply[0] == '1')
738	ret = 1;
739out:
740    close(fd);
741    return ret;
742}
743
744static int pci_device_linux_sysfs_has_kernel_driver(struct pci_device *dev)
745{
746    char name[256];
747    struct stat dummy;
748    int ret;
749
750    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/driver",
751	      SYS_BUS_PCI,
752	      dev->domain,
753	      dev->bus,
754	      dev->dev,
755	      dev->func );
756
757    ret = stat(name, &dummy);
758    if (ret < 0)
759	return 0;
760    return 1;
761}
762