linux_sysfs.c revision 6a94483f
1/*
2 * (C) Copyright IBM Corporation 2006
3 * All Rights Reserved.
4 * Copyright 2012 Red Hat, Inc.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * on the rights to use, copy, modify, merge, publish, distribute, sub
10 * license, and/or sell copies of the Software, and to permit persons to whom
11 * the Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
20 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
24 */
25
26/**
27 * \file linux_sysfs.c
28 * Access PCI subsystem using Linux's sysfs interface.  This interface is
29 * available starting somewhere in the late 2.5.x kernel phase, and is the
30 * preferred method on all 2.6.x kernels.
31 *
32 * \author Ian Romanick <idr@us.ibm.com>
33 */
34
35#define _GNU_SOURCE
36
37#ifdef HAVE_CONFIG_H
38#include "config.h"
39#endif
40
41#include <stdlib.h>
42#include <string.h>
43#include <stdio.h>
44#include <unistd.h>
45#include <sys/types.h>
46#include <sys/stat.h>
47#include <fcntl.h>
48#include <sys/mman.h>
49#include <dirent.h>
50#include <errno.h>
51
52#if defined(__i386__) || defined(__x86_64__) || defined(__arm__)
53#include <sys/io.h>
54#else
55#define inb(x) -1
56#define inw(x) -1
57#define inl(x) -1
58#define outb(x,y) do {} while (0)
59#define outw(x,y) do {} while (0)
60#define outl(x,y) do {} while (0)
61#define iopl(x) -1
62#endif
63
64#ifdef HAVE_MTRR
65#include <asm/mtrr.h>
66#include <sys/ioctl.h>
67#endif
68
69#include "pciaccess.h"
70#include "pciaccess_private.h"
71#include "linux_devmem.h"
72
73static const struct pci_system_methods linux_sysfs_methods;
74
75#define SYS_BUS_PCI "/sys/bus/pci/devices"
76
77static int
78pci_device_linux_sysfs_read( struct pci_device * dev, void * data,
79			     pciaddr_t offset, pciaddr_t size,
80			     pciaddr_t * bytes_read );
81
82static int populate_entries(struct pci_system * pci_sys);
83
84/**
85 * Attempt to access PCI subsystem using Linux's sysfs interface.
86 */
87_pci_hidden int
88pci_system_linux_sysfs_create( void )
89{
90    int err = 0;
91    struct stat st;
92
93
94    /* If the directory "/sys/bus/pci/devices" exists, then the PCI subsystem
95     * can be accessed using this interface.
96     */
97
98    if ( stat( SYS_BUS_PCI, & st ) == 0 ) {
99	pci_sys = calloc( 1, sizeof( struct pci_system ) );
100	if ( pci_sys != NULL ) {
101	    pci_sys->methods = & linux_sysfs_methods;
102#ifdef HAVE_MTRR
103	    pci_sys->mtrr_fd = open("/proc/mtrr", O_WRONLY | O_CLOEXEC);
104#endif
105	    err = populate_entries(pci_sys);
106	}
107	else {
108	    err = ENOMEM;
109	}
110    }
111    else {
112	err = errno;
113    }
114
115    return err;
116}
117
118
/**
 * \c scandir selection callback that drops the "." and ".." entries.
 *
 * \param d  Directory entry currently being examined by \c scandir.
 *
 * \return
 * Zero when the entry is named "." or "..", one for every other name.
 *
 * \sa scandir, populate_entries
 */
static int
scan_sys_pci_filter( const struct dirent * d )
{
    const char * const entry_name = d->d_name;

    if ( strcmp( entry_name, "." ) == 0 ) {
        return 0;
    }

    if ( strcmp( entry_name, ".." ) == 0 ) {
        return 0;
    }

    return 1;
}
135
136
137int
138populate_entries( struct pci_system * p )
139{
140    struct dirent ** devices = NULL;
141    int n;
142    int i;
143    int err = 0;
144
145
146    n = scandir( SYS_BUS_PCI, & devices, scan_sys_pci_filter, alphasort );
147    if ( n > 0 ) {
148	p->num_devices = n;
149	p->devices = calloc( n, sizeof( struct pci_device_private ) );
150
151	if (p->devices != NULL) {
152	    for (i = 0 ; i < n ; i++) {
153		uint8_t config[48];
154		pciaddr_t bytes;
155		unsigned dom, bus, dev, func;
156		struct pci_device_private *device =
157			(struct pci_device_private *) &p->devices[i];
158
159
160		sscanf(devices[i]->d_name, "%04x:%02x:%02x.%1u",
161		       & dom, & bus, & dev, & func);
162
163		device->base.domain = dom;
164		device->base.bus = bus;
165		device->base.dev = dev;
166		device->base.func = func;
167
168
169		err = pci_device_linux_sysfs_read(& device->base, config, 0,
170						  48, & bytes);
171		if ((bytes == 48) && !err) {
172		    device->base.vendor_id = (uint16_t)config[0]
173			+ ((uint16_t)config[1] << 8);
174		    device->base.device_id = (uint16_t)config[2]
175			+ ((uint16_t)config[3] << 8);
176		    device->base.device_class = (uint32_t)config[9]
177			+ ((uint32_t)config[10] << 8)
178			+ ((uint32_t)config[11] << 16);
179		    device->base.revision = config[8];
180		    device->base.subvendor_id = (uint16_t)config[44]
181			+ ((uint16_t)config[45] << 8);
182		    device->base.subdevice_id = (uint16_t)config[46]
183			+ ((uint16_t)config[47] << 8);
184		}
185
186		if (err) {
187		    break;
188		}
189	    }
190	}
191	else {
192	    err = ENOMEM;
193	}
194    }
195
196    for (i = 0; i < n; i++)
197	free(devices[i]);
198    free(devices);
199
200    if (err) {
201	free(p->devices);
202	p->devices = NULL;
203    }
204
205    return err;
206}
207
208
209static int
210pci_device_linux_sysfs_probe( struct pci_device * dev )
211{
212    char     name[256];
213    uint8_t  config[256];
214    char     resource[512];
215    int fd;
216    pciaddr_t bytes;
217    unsigned i;
218    int err;
219
220
221    err = pci_device_linux_sysfs_read( dev, config, 0, 256, & bytes );
222    if ( bytes >= 64 ) {
223	struct pci_device_private *priv = (struct pci_device_private *) dev;
224
225	dev->irq = config[60];
226	priv->header_type = config[14];
227
228
229	/* The PCI config registers can be used to obtain information
230	 * about the memory and I/O regions for the device.  However,
231	 * doing so requires some tricky parsing (to correctly handle
232	 * 64-bit memory regions) and requires writing to the config
233	 * registers.  Since we'd like to avoid having to deal with the
234	 * parsing issues and non-root users can write to PCI config
235	 * registers, we use a different file in the device's sysfs
236	 * directory called "resource".
237	 *
238	 * The resource file contains all of the needed information in
239	 * a format that is consistent across all platforms.  Each BAR
240	 * and the expansion ROM have a single line of data containing
241	 * 3, 64-bit hex values:  the first address in the region,
242	 * the last address in the region, and the region's flags.
243	 */
244	snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/resource",
245		  SYS_BUS_PCI,
246		  dev->domain,
247		  dev->bus,
248		  dev->dev,
249		  dev->func );
250	fd = open( name, O_RDONLY | O_CLOEXEC);
251	if ( fd != -1 ) {
252	    char * next;
253	    pciaddr_t  low_addr;
254	    pciaddr_t  high_addr;
255	    pciaddr_t  flags;
256
257
258	    bytes = read( fd, resource, 512 );
259	    resource[511] = '\0';
260
261	    close( fd );
262
263	    next = resource;
264	    for ( i = 0 ; i < 6 ; i++ ) {
265
266		dev->regions[i].base_addr = strtoull( next, & next, 16 );
267		high_addr = strtoull( next, & next, 16 );
268		flags = strtoull( next, & next, 16 );
269
270		if ( dev->regions[i].base_addr != 0 ) {
271		    dev->regions[i].size = (high_addr
272					    - dev->regions[i].base_addr) + 1;
273
274		    dev->regions[i].is_IO = (flags & 0x01);
275		    dev->regions[i].is_64 = (flags & 0x04);
276		    dev->regions[i].is_prefetchable = (flags & 0x08);
277		}
278	    }
279
280	    low_addr = strtoull( next, & next, 16 );
281	    high_addr = strtoull( next, & next, 16 );
282	    flags = strtoull( next, & next, 16 );
283	    if ( low_addr != 0 ) {
284		priv->rom_base = low_addr;
285		dev->rom_size = (high_addr - low_addr) + 1;
286	    }
287	}
288    }
289
290    return err;
291}
292
293
294static int
295pci_device_linux_sysfs_read_rom( struct pci_device * dev, void * buffer )
296{
297    char name[256];
298    int fd;
299    struct stat  st;
300    int err = 0;
301    size_t rom_size;
302    size_t total_bytes;
303
304
305    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/rom",
306	      SYS_BUS_PCI,
307	      dev->domain,
308	      dev->bus,
309	      dev->dev,
310	      dev->func );
311
312    fd = open( name, O_RDWR | O_CLOEXEC);
313    if ( fd == -1 ) {
314#ifdef LINUX_ROM
315	/* If reading the ROM using sysfs fails, fall back to the old
316	 * /dev/mem based interface.
317	 * disable this for newer kernels using configure
318	 */
319	return pci_device_linux_devmem_read_rom(dev, buffer);
320#else
321	return errno;
322#endif
323    }
324
325
326    if ( fstat( fd, & st ) == -1 ) {
327	close( fd );
328	return errno;
329    }
330
331    rom_size = st.st_size;
332    if ( rom_size == 0 )
333	rom_size = 0x10000;
334
335    /* This is a quirky thing on Linux.  Even though the ROM and the file
336     * for the ROM in sysfs are read-only, the string "1" must be written to
337     * the file to enable the ROM.  After the data has been read, "0" must be
338     * written to the file to disable the ROM.
339     */
340    write( fd, "1", 1 );
341    lseek( fd, 0, SEEK_SET );
342
343    for ( total_bytes = 0 ; total_bytes < rom_size ; /* empty */ ) {
344	const int bytes = read( fd, (char *) buffer + total_bytes,
345				rom_size - total_bytes );
346	if ( bytes == -1 ) {
347	    err = errno;
348	    break;
349	}
350	else if ( bytes == 0 ) {
351	    break;
352	}
353
354	total_bytes += bytes;
355    }
356
357
358    lseek( fd, 0, SEEK_SET );
359    write( fd, "0", 1 );
360
361    close( fd );
362    return err;
363}
364
365
366static int
367pci_device_linux_sysfs_read( struct pci_device * dev, void * data,
368			     pciaddr_t offset, pciaddr_t size,
369			     pciaddr_t * bytes_read )
370{
371    char name[256];
372    pciaddr_t temp_size = size;
373    int err = 0;
374    int fd;
375    char *data_bytes = data;
376
377    if ( bytes_read != NULL ) {
378	*bytes_read = 0;
379    }
380
381    /* Each device has a directory under sysfs.  Within that directory there
382     * is a file named "config".  This file used to access the PCI config
383     * space.  It is used here to obtain most of the information about the
384     * device.
385     */
386    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/config",
387	      SYS_BUS_PCI,
388	      dev->domain,
389	      dev->bus,
390	      dev->dev,
391	      dev->func );
392
393    fd = open( name, O_RDONLY | O_CLOEXEC);
394    if ( fd == -1 ) {
395	return errno;
396    }
397
398
399    while ( temp_size > 0 ) {
400	const ssize_t bytes = pread64( fd, data_bytes, temp_size, offset );
401
402	/* If zero bytes were read, then we assume it's the end of the
403	 * config file.
404	 */
405	if (bytes == 0)
406	    break;
407	if ( bytes < 0 ) {
408	    err = errno;
409	    break;
410	}
411
412	temp_size -= bytes;
413	offset += bytes;
414	data_bytes += bytes;
415    }
416
417    if ( bytes_read != NULL ) {
418	*bytes_read = size - temp_size;
419    }
420
421    close( fd );
422    return err;
423}
424
425
426static int
427pci_device_linux_sysfs_write( struct pci_device * dev, const void * data,
428			     pciaddr_t offset, pciaddr_t size,
429			     pciaddr_t * bytes_written )
430{
431    char name[256];
432    pciaddr_t temp_size = size;
433    int err = 0;
434    int fd;
435    const char *data_bytes = data;
436
437    if ( bytes_written != NULL ) {
438	*bytes_written = 0;
439    }
440
441    /* Each device has a directory under sysfs.  Within that directory there
442     * is a file named "config".  This file used to access the PCI config
443     * space.  It is used here to obtain most of the information about the
444     * device.
445     */
446    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/config",
447	      SYS_BUS_PCI,
448	      dev->domain,
449	      dev->bus,
450	      dev->dev,
451	      dev->func );
452
453    fd = open( name, O_WRONLY | O_CLOEXEC);
454    if ( fd == -1 ) {
455	return errno;
456    }
457
458
459    while ( temp_size > 0 ) {
460	const ssize_t bytes = pwrite64( fd, data_bytes, temp_size, offset );
461
462	/* If zero bytes were written, then we assume it's the end of the
463	 * config file.
464	 */
465	if ( bytes == 0 )
466	    break;
467	if ( bytes < 0 ) {
468	    err = errno;
469	    break;
470	}
471
472	temp_size -= bytes;
473	offset += bytes;
474	data_bytes += bytes;
475    }
476
477    if ( bytes_written != NULL ) {
478	*bytes_written = size - temp_size;
479    }
480
481    close( fd );
482    return err;
483}
484
485static int
486pci_device_linux_sysfs_map_range_wc(struct pci_device *dev,
487				    struct pci_device_mapping *map)
488{
489    char name[256];
490    int fd;
491    const int prot = ((map->flags & PCI_DEV_MAP_FLAG_WRITABLE) != 0)
492        ? (PROT_READ | PROT_WRITE) : PROT_READ;
493    const int open_flags = ((map->flags & PCI_DEV_MAP_FLAG_WRITABLE) != 0)
494        ? O_RDWR : O_RDONLY;
495    const off_t offset = map->base - dev->regions[map->region].base_addr;
496
497    snprintf(name, 255, "%s/%04x:%02x:%02x.%1u/resource%u_wc",
498	     SYS_BUS_PCI,
499	     dev->domain,
500	     dev->bus,
501	     dev->dev,
502	     dev->func,
503	     map->region);
504    fd = open(name, open_flags | O_CLOEXEC);
505    if (fd == -1)
506	    return errno;
507
508    map->memory = mmap(NULL, map->size, prot, MAP_SHARED, fd, offset);
509    if (map->memory == MAP_FAILED) {
510        map->memory = NULL;
511	close(fd);
512	return errno;
513    }
514
515    close(fd);
516
517    return 0;
518}
519
520/**
521 * Map a memory region for a device using the Linux sysfs interface.
522 *
523 * \param dev   Device whose memory region is to be mapped.
524 * \param map   Parameters of the mapping that is to be created.
525 *
526 * \return
527 * Zero on success or an \c errno value on failure.
528 *
529 * \sa pci_device_map_rrange, pci_device_linux_sysfs_unmap_range
530 *
531 * \todo
532 * Some older 2.6.x kernels don't implement the resourceN files.  On those
533 * systems /dev/mem must be used.  On these systems it is also possible that
534 * \c mmap64 may need to be used.
535 */
536static int
537pci_device_linux_sysfs_map_range(struct pci_device *dev,
538                                 struct pci_device_mapping *map)
539{
540    char name[256];
541    int fd;
542    int err = 0;
543    const int prot = ((map->flags & PCI_DEV_MAP_FLAG_WRITABLE) != 0)
544        ? (PROT_READ | PROT_WRITE) : PROT_READ;
545    const int open_flags = ((map->flags & PCI_DEV_MAP_FLAG_WRITABLE) != 0)
546        ? O_RDWR : O_RDONLY;
547    const off_t offset = map->base - dev->regions[map->region].base_addr;
548#ifdef HAVE_MTRR
549    struct mtrr_sentry sentry = {
550	.base = map->base,
551        .size = map->size,
552	.type = MTRR_TYPE_UNCACHABLE
553    };
554#endif
555
556    /* For WC mappings, try sysfs resourceN_wc file first */
557    if ((map->flags & PCI_DEV_MAP_FLAG_WRITE_COMBINE) &&
558	!pci_device_linux_sysfs_map_range_wc(dev, map))
559	    return 0;
560
561    snprintf(name, 255, "%s/%04x:%02x:%02x.%1u/resource%u",
562             SYS_BUS_PCI,
563             dev->domain,
564             dev->bus,
565             dev->dev,
566             dev->func,
567             map->region);
568
569    fd = open(name, open_flags | O_CLOEXEC);
570    if (fd == -1) {
571        return errno;
572    }
573
574
575    map->memory = mmap(NULL, map->size, prot, MAP_SHARED, fd, offset);
576    if (map->memory == MAP_FAILED) {
577        map->memory = NULL;
578	close(fd);
579	return errno;
580    }
581
582#ifdef HAVE_MTRR
583    if ((map->flags & PCI_DEV_MAP_FLAG_CACHABLE) != 0) {
584        sentry.type = MTRR_TYPE_WRBACK;
585    } else if ((map->flags & PCI_DEV_MAP_FLAG_WRITE_COMBINE) != 0) {
586        sentry.type = MTRR_TYPE_WRCOMB;
587    }
588
589    if (pci_sys->mtrr_fd != -1 && sentry.type != MTRR_TYPE_UNCACHABLE) {
590	if (ioctl(pci_sys->mtrr_fd, MTRRIOC_ADD_ENTRY, &sentry) < 0) {
591	    /* FIXME: Should we report an error in this case?
592	     */
593	    fprintf(stderr, "error setting MTRR "
594		    "(base = 0x%08lx, size = 0x%08x, type = %u) %s (%d)\n",
595		    sentry.base, sentry.size, sentry.type,
596		    strerror(errno), errno);
597/*            err = errno;*/
598	}
599	/* KLUDGE ALERT -- rewrite the PTEs to turn off the CD and WT bits */
600	mprotect (map->memory, map->size, PROT_NONE);
601	err = mprotect (map->memory, map->size, PROT_READ|PROT_WRITE);
602
603	if (err != 0) {
604	    fprintf(stderr, "mprotect(PROT_READ | PROT_WRITE) failed: %s\n",
605		    strerror(errno));
606	    fprintf(stderr, "remapping without mprotect performance kludge.\n");
607
608	    munmap(map->memory, map->size);
609	    map->memory = mmap(NULL, map->size, prot, MAP_SHARED, fd, offset);
610	    if (map->memory == MAP_FAILED) {
611		map->memory = NULL;
612		close(fd);
613		return errno;
614	    }
615	}
616    }
617#endif
618
619    close(fd);
620
621    return 0;
622}
623
/**
 * Unmap a memory region for a device using the Linux sysfs interface.
 *
 * The mapping itself is released by \c pci_device_generic_unmap_range.
 * When built with MTRR support and "/proc/mtrr" is open, the MTRR entry
 * that corresponds to a cachable or write-combining mapping is deleted
 * afterwards; a failure to do so is only logged to \c stderr.
 *
 * \param dev   Device whose memory region is to be unmapped.
 * \param map   Parameters of the mapping that is to be destroyed.
 *
 * \return
 * Zero on success or the error returned by
 * \c pci_device_generic_unmap_range.
 *
 * \sa pci_device_linux_sysfs_map_range
 */
static int
pci_device_linux_sysfs_unmap_range(struct pci_device *dev,
                                   struct pci_device_mapping *map)
{
#ifdef HAVE_MTRR
    /* Describe the range before the mapping is torn down. */
    struct mtrr_sentry range = {
        .base = map->base,
        .size = map->size,
        .type = MTRR_TYPE_UNCACHABLE
    };
#endif
    const int status = pci_device_generic_unmap_range (dev, map);

    if (status != 0) {
        return status;
    }

#ifdef HAVE_MTRR
    if ((map->flags & PCI_DEV_MAP_FLAG_CACHABLE) != 0) {
        range.type = MTRR_TYPE_WRBACK;
    } else if ((map->flags & PCI_DEV_MAP_FLAG_WRITE_COMBINE) != 0) {
        range.type = MTRR_TYPE_WRCOMB;
    }

    /* Only cachable / write-combining mappings can have an MTRR entry,
     * and only when /proc/mtrr is open.
     */
    if (pci_sys->mtrr_fd != -1
        && range.type != MTRR_TYPE_UNCACHABLE
        && ioctl(pci_sys->mtrr_fd, MTRRIOC_DEL_ENTRY, &range) < 0) {
        /* FIXME: Should we report an error in this case?
         */
        fprintf(stderr, "error setting MTRR "
                "(base = 0x%08lx, size = 0x%08x, type = %u) %s (%d)\n",
                range.base, range.size, range.type,
                strerror(errno), errno);
    }
#endif

    return status;
}
679
680static void pci_device_linux_sysfs_enable(struct pci_device *dev)
681{
682    char name[256];
683    int fd;
684
685    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/enable",
686	      SYS_BUS_PCI,
687	      dev->domain,
688	      dev->bus,
689	      dev->dev,
690	      dev->func );
691
692    fd = open( name, O_RDWR | O_CLOEXEC);
693    if (fd == -1)
694       return;
695
696    write( fd, "1", 1 );
697    close(fd);
698}
699
700static int pci_device_linux_sysfs_boot_vga(struct pci_device *dev)
701{
702    char name[256];
703    char reply[3];
704    int fd, bytes_read;
705    int ret = 0;
706
707    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/boot_vga",
708	      SYS_BUS_PCI,
709	      dev->domain,
710	      dev->bus,
711	      dev->dev,
712	      dev->func );
713
714    fd = open( name, O_RDONLY | O_CLOEXEC);
715    if (fd == -1)
716       return 0;
717
718    bytes_read = read(fd, reply, 1);
719    if (bytes_read != 1)
720	goto out;
721    if (reply[0] == '1')
722	ret = 1;
723out:
724    close(fd);
725    return ret;
726}
727
728static int pci_device_linux_sysfs_has_kernel_driver(struct pci_device *dev)
729{
730    char name[256];
731    struct stat dummy;
732    int ret;
733
734    snprintf( name, 255, "%s/%04x:%02x:%02x.%1u/driver",
735	      SYS_BUS_PCI,
736	      dev->domain,
737	      dev->bus,
738	      dev->dev,
739	      dev->func );
740
741    ret = stat(name, &dummy);
742    if (ret < 0)
743	return 0;
744    return 1;
745}
746
747static struct pci_io_handle *
748pci_device_linux_sysfs_open_device_io(struct pci_io_handle *ret,
749				      struct pci_device *dev, int bar,
750				      pciaddr_t base, pciaddr_t size)
751{
752    char name[PATH_MAX];
753
754    snprintf(name, PATH_MAX, "%s/%04x:%02x:%02x.%1u/resource%d",
755	     SYS_BUS_PCI, dev->domain, dev->bus, dev->dev, dev->func, bar);
756
757    ret->fd = open(name, O_RDWR | O_CLOEXEC);
758
759    if (ret->fd < 0)
760	return NULL;
761
762    ret->base = base;
763    ret->size = size;
764    ret->is_legacy = 0;
765
766    return ret;
767}
768
769static struct pci_io_handle *
770pci_device_linux_sysfs_open_legacy_io(struct pci_io_handle *ret,
771				      struct pci_device *dev, pciaddr_t base,
772				      pciaddr_t size)
773{
774    char name[PATH_MAX];
775
776    /* First check if there's a legacy io method for the device */
777    while (dev) {
778	snprintf(name, PATH_MAX, "/sys/class/pci_bus/%04x:%02x/legacy_io",
779		 dev->domain, dev->bus);
780
781	ret->fd = open(name, O_RDWR | O_CLOEXEC);
782	if (ret->fd >= 0)
783	    break;
784
785	dev = pci_device_get_parent_bridge(dev);
786    }
787
788    /*
789     * You would think you'd want to use /dev/port here.  Don't make that
790     * mistake, /dev/port only does byte-wide i/o cycles which means it
791     * doesn't work.  If you think this is stupid, well, you're right.
792     */
793
794    /* If we've no other choice, iopl */
795    if (ret->fd < 0) {
796	if (iopl(3))
797	    return NULL;
798    }
799
800    ret->base = base;
801    ret->size = size;
802    ret->is_legacy = 1;
803
804    return ret;
805}
806
807static void
808pci_device_linux_sysfs_close_io(struct pci_device *dev,
809				struct pci_io_handle *handle)
810{
811    if (handle->fd > -1)
812	close(handle->fd);
813}
814
815static uint32_t
816pci_device_linux_sysfs_read32(struct pci_io_handle *handle, uint32_t port)
817{
818    uint32_t ret;
819
820    if (handle->fd > -1) {
821	if (handle->is_legacy)
822	    pread(handle->fd, &ret, 4, port + handle->base);
823	else
824	    pread(handle->fd, &ret, 4, port);
825    } else {
826	ret = inl(port + handle->base);
827    }
828
829    return ret;
830}
831
832static uint16_t
833pci_device_linux_sysfs_read16(struct pci_io_handle *handle, uint32_t port)
834{
835    uint16_t ret;
836
837    if (handle->fd > -1) {
838	if (handle->is_legacy)
839	    pread(handle->fd, &ret, 2, port + handle->base);
840	else
841	    pread(handle->fd, &ret, 2, port);
842    } else {
843	ret = inw(port + handle->base);
844    }
845
846    return ret;
847}
848
849static uint8_t
850pci_device_linux_sysfs_read8(struct pci_io_handle *handle, uint32_t port)
851{
852    uint8_t ret;
853
854    if (handle->fd > -1) {
855	if (handle->is_legacy)
856	    pread(handle->fd, &ret, 1, port + handle->base);
857	else
858	    pread(handle->fd, &ret, 1, port);
859    } else {
860	ret = inb(port + handle->base);
861    }
862
863    return ret;
864}
865
866static void
867pci_device_linux_sysfs_write32(struct pci_io_handle *handle, uint32_t port,
868			       uint32_t data)
869{
870    if (handle->fd > -1) {
871	if (handle->is_legacy)
872	    pwrite(handle->fd, &data, 4, port + handle->base);
873	else
874	    pwrite(handle->fd, &data, 4, port);
875    } else {
876	outl(data, port + handle->base);
877    }
878}
879
880static void
881pci_device_linux_sysfs_write16(struct pci_io_handle *handle, uint32_t port,
882			       uint16_t data)
883{
884    if (handle->fd > -1) {
885	if (handle->is_legacy)
886	    pwrite(handle->fd, &data, 2, port + handle->base);
887	else
888	    pwrite(handle->fd, &data, 2, port);
889    } else {
890	outw(data, port + handle->base);
891    }
892}
893
894static void
895pci_device_linux_sysfs_write8(struct pci_io_handle *handle, uint32_t port,
896			      uint8_t data)
897{
898    if (handle->fd > -1) {
899	if (handle->is_legacy)
900	    pwrite(handle->fd, &data, 1, port + handle->base);
901	else
902	    pwrite(handle->fd, &data, 1, port);
903    } else {
904	outb(data, port + handle->base);
905    }
906}
907
908static int
909pci_device_linux_sysfs_map_legacy(struct pci_device *dev, pciaddr_t base,
910				  pciaddr_t size, unsigned map_flags, void **addr)
911{
912    char name[PATH_MAX];
913    int flags = O_RDONLY;
914    int prot = PROT_READ;
915    int fd;
916    int ret=0;
917
918    if (map_flags & PCI_DEV_MAP_FLAG_WRITABLE) {
919	flags = O_RDWR; /* O_RDWR != O_WRONLY | O_RDONLY */;
920	prot |= PROT_WRITE;
921    }
922
923    /* First check if there's a legacy memory method for the device */
924    while (dev) {
925	snprintf(name, PATH_MAX, "/sys/class/pci_bus/%04x:%02x/legacy_mem",
926		 dev->domain, dev->bus);
927
928	fd = open(name, flags | O_CLOEXEC);
929	if (fd >= 0)
930	    break;
931
932	dev = pci_device_get_parent_bridge(dev);
933    }
934
935    /* If not, /dev/mem is the best we can do */
936    if (!dev)
937	fd = open("/dev/mem", flags | O_CLOEXEC);
938
939    if (fd < 0)
940	return errno;
941
942    *addr = mmap(NULL, size, prot, MAP_SHARED, fd, base);
943    if (*addr == MAP_FAILED) {
944	ret = errno;
945    }
946
947    close(fd);
948    return ret;
949}
950
951static int
952pci_device_linux_sysfs_unmap_legacy(struct pci_device *dev, void *addr, pciaddr_t size)
953{
954    return munmap(addr, size);
955}
956
957
/**
 * Release the resources held by the sysfs backend.
 *
 * When built with MTRR support this closes the "/proc/mtrr" descriptor,
 * if one was opened; otherwise it does nothing.
 */
static void
pci_system_linux_destroy(void)
{
#ifdef HAVE_MTRR
    if (pci_sys->mtrr_fd == -1) {
        return;
    }

    close(pci_sys->mtrr_fd);
#endif
}
966
967static const struct pci_system_methods linux_sysfs_methods = {
968    .destroy = pci_system_linux_destroy,
969    .destroy_device = NULL,
970    .read_rom = pci_device_linux_sysfs_read_rom,
971    .probe = pci_device_linux_sysfs_probe,
972    .map_range = pci_device_linux_sysfs_map_range,
973    .unmap_range = pci_device_linux_sysfs_unmap_range,
974
975    .read = pci_device_linux_sysfs_read,
976    .write = pci_device_linux_sysfs_write,
977
978    .fill_capabilities = pci_fill_capabilities_generic,
979    .enable = pci_device_linux_sysfs_enable,
980    .boot_vga = pci_device_linux_sysfs_boot_vga,
981    .has_kernel_driver = pci_device_linux_sysfs_has_kernel_driver,
982
983    .open_device_io = pci_device_linux_sysfs_open_device_io,
984    .open_legacy_io = pci_device_linux_sysfs_open_legacy_io,
985    .close_io = pci_device_linux_sysfs_close_io,
986    .read32 = pci_device_linux_sysfs_read32,
987    .read16 = pci_device_linux_sysfs_read16,
988    .read8 = pci_device_linux_sysfs_read8,
989    .write32 = pci_device_linux_sysfs_write32,
990    .write16 = pci_device_linux_sysfs_write16,
991    .write8 = pci_device_linux_sysfs_write8,
992
993    .map_legacy = pci_device_linux_sysfs_map_legacy,
994    .unmap_legacy = pci_device_linux_sysfs_unmap_legacy,
995};
996